k-mktr committed on
Commit
631fbda
•
1 Parent(s): 10c8168

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +360 -0
app.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from functools import lru_cache
3
+ import random
4
+ import requests
5
+ import logging
6
+ import arena_config
7
+ import plotly.graph_objects as go
8
+ from typing import Dict
9
+ from leaderboard import get_current_leaderboard, update_leaderboard
10
+
11
# Initialize logging for errors only: the root logger is configured at ERROR
# level, so lower-severity records are not emitted by default.
logging.basicConfig(level=logging.ERROR)
# Module-level logger used by the functions below to report failures.
logger = logging.getLogger(__name__)
14
+
15
+ # Function to get available models (using predefined list)
16
def get_available_models():
    """Return the first element of every entry in ``arena_config.APPROVED_MODELS``.

    The result is a new list built on each call, in the same order as the
    configured entries.
    """
    first_fields = []
    for entry in arena_config.APPROVED_MODELS:
        first_fields.append(entry[0])
    return first_fields
18
+
19
+ # Function to call Ollama API with caching
20
@lru_cache(maxsize=100)
def _fetch_completion(model, prompt):
    """POST a single-turn chat request and return the reply text.

    This helper raises on every failure (transport error, HTTP error status,
    malformed body). ``lru_cache`` does not store calls that raised, so only
    successful completions are memoized.
    """
    response = requests.post(
        f"{arena_config.API_URL}/v1/chat/completions",
        headers=arena_config.HEADERS,
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
        },
        timeout=100,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


def call_ollama_api(model, prompt):
    """Return the model's reply to ``prompt``, or a fixed error string.

    Successful replies are cached per ``(model, prompt)`` pair (up to 100
    entries). Failures are logged and reported as an error string but are
    never cached, so a transient outage does not poison later calls with the
    same arguments.

    Args:
        model: Model identifier sent in the request payload.
        prompt: User message sent as the only chat message.

    Returns:
        The content of the first choice in the response, or
        ``"Error: Unable to get response from the model."`` on failure.
    """
    try:
        return _fetch_completion(model, prompt)
    except requests.exceptions.RequestException as e:
        logger.error("Error calling Ollama API for model %s: %s", model, e)
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The server answered, but not with the expected
        # {"choices": [{"message": {"content": ...}}]} structure (or not with
        # JSON at all).
        logger.error(
            "Unexpected response from Ollama API for model %s: %r", model, e
        )
    return "Error: Unable to get response from the model."


# Keep the cache-management hooks that were available when the public function
# itself carried the lru_cache decorator.
call_ollama_api.cache_info = _fetch_completion.cache_info
call_ollama_api.cache_clear = _fetch_completion.cache_clear
39
+
40
+ # Generate responses using two randomly selected models
41
def generate_responses(prompt):
    """Send ``prompt`` to two randomly drawn models.

    Returns:
        A 4-tuple ``(response_a, response_b, model_a, model_b)``. When fewer
        than two models are available, both responses are the same error
        string and both model slots are ``None``.
    """
    candidates = get_available_models()
    if not len(candidates) >= 2:
        shortage = "Error: Not enough models available"
        return shortage, shortage, None, None

    model_a, model_b = random.sample(candidates, 2)

    # Tuple elements are evaluated left to right, so model A is queried first.
    return (
        call_ollama_api(model_a, prompt),
        call_ollama_api(model_b, prompt),
        model_a,
        model_b,
    )
53
+
54
def battle_arena(prompt):
    """Run one battle and build the updates for the arena widgets.

    Two models answer ``prompt``; the answers are then placed on the left or
    right side at random so the position reveals nothing about the model.
    Each side gets a random nickname, and the two nicknames are distinct
    whenever at least two are configured, so the panes and vote buttons can
    be told apart.

    Args:
        prompt: The user's prompt text.

    Returns:
        An 8-tuple: left messages, right messages, left model, right model,
        an update for the left chat (label + messages), an update for the
        right chat, and updates for the left and right vote buttons.
    """
    response_a, response_b, model_a, model_b = generate_responses(prompt)

    nicknames = list(arena_config.model_nicknames)
    if len(nicknames) >= 2:
        left_nickname, right_nickname = random.sample(nicknames, 2)
    else:
        # Not enough nicknames to differentiate the sides; reuse the one we have.
        left_nickname = right_nickname = random.choice(nicknames)

    # Format responses for gr.Chatbot (type="messages")
    left_side = ([{"role": "assistant", "content": response_a}], model_a)
    right_side = ([{"role": "assistant", "content": response_b}], model_b)

    # Coin flip decides which model is shown on which side.
    if not random.choice([True, False]):
        left_side, right_side = right_side, left_side

    left_messages, left_model = left_side
    right_messages, right_model = right_side

    # generate_responses yields None models when it could not pick two; there
    # is nothing to vote on in that case.
    can_vote = left_model is not None and right_model is not None

    return (
        left_messages, right_messages, left_model, right_model,
        gr.update(label=left_nickname, value=left_messages),
        gr.update(label=right_nickname, value=right_messages),
        gr.update(interactive=can_vote, value=f"Vote for {left_nickname}"),
        gr.update(interactive=can_vote, value=f"Vote for {right_nickname}"),
    )
80
+
81
def record_vote(prompt, left_response, right_response, left_model, right_model, choice):
    """Record a vote and build the post-vote widget updates.

    Args:
        prompt: The prompt that was submitted (not used by this function).
        left_response: Current value of the left chat pane.
        right_response: Current value of the right chat pane.
        left_model: Model shown on the left.
        right_model: Model shown on the right.
        choice: ``"Left is better"`` selects the left model as the winner;
            any other value selects the right model.

    Returns:
        A 6-tuple of updates for: the result box, the leaderboard HTML, the
        left vote button, the right vote button, the model-names row and the
        leaderboard chart.
    """
    # Check if outputs are generated
    if not (left_response and right_response and left_model and right_model):
        return (
            # The result box starts hidden, so it must be made visible for
            # this hint to reach the user.
            gr.update(value="Please generate responses before voting.", visible=True),
            gr.update(),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(visible=False),
            gr.update(),
        )

    if choice == "Left is better":
        winner, loser = left_model, right_model
    else:
        winner, loser = right_model, left_model

    # Update the leaderboard
    update_leaderboard(winner, loser)

    result_message = (
        "\n"
        "🎉 Vote recorded! You're awesome! 🌟\n"
        "\n"
        f"🔵 In the left corner: {get_human_readable_name(left_model)}\n"
        f"🔴 In the right corner: {get_human_readable_name(right_model)}\n"
        "\n"
        f"🏆 And the champion you picked is... {get_human_readable_name(winner)}! 🥇\n"
    )

    return (
        gr.update(value=result_message, visible=True),  # Show the result
        get_leaderboard(),  # Update leaderboard
        gr.update(interactive=False),  # Disable left vote button
        gr.update(interactive=False),  # Disable right vote button
        gr.update(visible=True),  # Show model names
        get_leaderboard_chart(),  # Update leaderboard chart
    )
116
+
117
def get_leaderboard():
    """Render the current leaderboard as an HTML table.

    Models are ordered by wins (descending) and, among equal wins, by fewer
    losses. For each model the table shows wins, losses, win rate, total
    battles, the opponent it has the most wins against ("Top Rival") and the
    opponent it has the most losses against ("Toughest Opponent").

    Returns:
        A string containing a ``<style>`` block followed by the table markup.
    """
    battle_results = get_current_leaderboard()
    sorted_results = sorted(
        battle_results.items(),
        key=lambda item: (item[1]["wins"], -item[1]["losses"]),
        reverse=True,
    )

    # The top three ranks are shown as medals; the rest as plain numbers.
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}

    # Collect fragments and join once instead of growing a string with +=.
    parts = ["""
    <style>
        .leaderboard-table {
            width: 100%;
            border-collapse: collapse;
            font-family: Arial, sans-serif;
        }
        .leaderboard-table th, .leaderboard-table td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        .leaderboard-table th {
            background-color: rgba(255, 255, 255, 0.1);
            font-weight: bold;
        }
        .rank-column {
            width: 60px;
            text-align: center;
        }
        .opponent-details {
            font-size: 0.9em;
            color: #888;
        }
    </style>
    <table class='leaderboard-table'>
        <tr>
            <th class='rank-column'>Rank</th>
            <th>Model</th>
            <th>Wins</th>
            <th>Losses</th>
            <th>Win Rate</th>
            <th>Total Battles</th>
            <th>Top Rival</th>
            <th>Toughest Opponent</th>
        </tr>
    """]

    for index, (model, results) in enumerate(sorted_results, start=1):
        total_battles = results["wins"] + results["losses"]
        # Guard against division by zero for models that have not fought yet.
        win_rate = (results["wins"] / total_battles * 100) if total_battles > 0 else 0

        rank_display = medals.get(index, f"{index}")

        # Find top rival (most wins against)
        top_rival = max(
            results["opponents"].items(),
            key=lambda item: item[1]["wins"],
            default=(None, {"wins": 0}),
        )
        top_rival_name = get_human_readable_name(top_rival[0]) if top_rival[0] else "N/A"
        top_rival_wins = top_rival[1]["wins"]

        # Find toughest opponent (most losses against)
        toughest_opponent = max(
            results["opponents"].items(),
            key=lambda item: item[1]["losses"],
            default=(None, {"losses": 0}),
        )
        toughest_opponent_name = (
            get_human_readable_name(toughest_opponent[0]) if toughest_opponent[0] else "N/A"
        )
        toughest_opponent_losses = toughest_opponent[1]["losses"]

        parts.append(f"""
        <tr>
            <td class='rank-column'>{rank_display}</td>
            <td>{get_human_readable_name(model)}</td>
            <td>{results['wins']}</td>
            <td>{results['losses']}</td>
            <td>{win_rate:.2f}%</td>
            <td>{total_battles}</td>
            <td class='opponent-details'>{top_rival_name} (W: {top_rival_wins})</td>
            <td class='opponent-details'>{toughest_opponent_name} (L: {toughest_opponent_losses})</td>
        </tr>
        """)

    parts.append("</table>")
    return "".join(parts)
198
+
199
def get_leaderboard_chart():
    """Build a stacked bar chart of wins and losses per model.

    Models appear on the x-axis in leaderboard order: most wins first and,
    among equal wins, fewer losses first.

    Returns:
        A ``plotly.graph_objects.Figure`` with a "Wins" and a "Losses" trace.
    """
    ranking = sorted(
        get_current_leaderboard().items(),
        key=lambda entry: (entry[1]["wins"], -entry[1]["losses"]),
        reverse=True,
    )

    labels = [get_human_readable_name(name) for name, _ in ranking]
    win_counts = [stats["wins"] for _, stats in ranking]
    loss_counts = [stats["losses"] for _, stats in ranking]

    chart = go.Figure()

    # One bar trace per outcome; barmode='stack' below piles them up.
    series = (
        ('Wins', win_counts, '#22577a'),
        ('Losses', loss_counts, '#38a3a5'),
    )
    for series_name, counts, colour in series:
        chart.add_trace(go.Bar(
            x=labels,
            y=counts,
            name=series_name,
            marker_color=colour,
        ))

    # Horizontal legend anchored just above the top-right of the plot area.
    legend_settings = dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='right',
        x=1,
    )
    chart.update_layout(
        title='Model Performance',
        xaxis_title='Models',
        yaxis_title='Number of Battles',
        barmode='stack',
        height=800,
        width=1450,
        autosize=True,
        legend=legend_settings,
    )

    return chart
245
+
246
def new_battle():
    """Reset the arena widgets for a fresh battle.

    Fresh nicknames are drawn for the two panes; they are distinct whenever
    at least two nicknames are configured, so the panes and vote buttons can
    be told apart.

    Returns:
        An 11-tuple of values/updates: cleared prompt, cleared left and right
        chats (with new labels), cleared left and right model fields,
        disabled left and right vote buttons (with new captions), hidden and
        emptied result box, untouched leaderboard, hidden model-names row and
        untouched leaderboard chart.
    """
    nicknames = list(arena_config.model_nicknames)
    if len(nicknames) >= 2:
        nickname_a, nickname_b = random.sample(nicknames, 2)
    else:
        # Not enough nicknames to differentiate the sides; reuse the one we have.
        nickname_a = nickname_b = random.choice(nicknames)

    return (
        "",  # Reset prompt_input
        gr.update(value=[], label=nickname_a),  # Reset left Chatbot
        gr.update(value=[], label=nickname_b),  # Reset right Chatbot
        None,  # Clear left model
        None,  # Clear right model
        gr.update(interactive=False, value=f"Vote for {nickname_a}"),
        gr.update(interactive=False, value=f"Vote for {nickname_b}"),
        gr.update(value="", visible=False),  # Hide and empty the result box
        gr.update(),  # Leaderboard unchanged
        gr.update(visible=False),  # Hide model names
        gr.update(),  # Leaderboard chart unchanged
    )
262
+
263
+ # Add this new function
264
def get_human_readable_name(model_name: str) -> str:
    """Map ``model_name`` to its display name.

    The entries of ``arena_config.APPROVED_MODELS`` are treated as key/value
    pairs. A name with no entry is returned unchanged.
    """
    display_names = dict(arena_config.APPROVED_MODELS)
    if model_name in display_names:
        return display_names[model_name]
    return model_name
267
+
268
+ # Add this new function to randomly select a prompt
269
def random_prompt():
    """Pick one entry at random from ``arena_config.example_prompts``."""
    examples = arena_config.example_prompts
    return random.choice(examples)
271
+
272
+ # Initialize Gradio Blocks
273
# Initialize Gradio Blocks: three tabs (arena, leaderboard table, chart) plus
# the event wiring between them.
with gr.Blocks(css="""
    #dice-button {
        min-height: 90px;
        font-size: 35px;
    }
""") as demo:
    gr.Markdown(arena_config.ARENA_NAME)
    gr.Markdown(arena_config.ARENA_DESCRIPTION)

    # Draw the initial pane nicknames together so the two sides start with
    # different labels whenever at least two nicknames are configured.
    _initial_nicknames = list(arena_config.model_nicknames)
    if len(_initial_nicknames) >= 2:
        _left_nickname, _right_nickname = random.sample(_initial_nicknames, 2)
    else:
        _left_nickname = _right_nickname = random.choice(_initial_nicknames)

    # Battle Arena Tab
    with gr.Tab("Battle Arena"):
        with gr.Row():
            prompt_input = gr.Textbox(
                label="Enter your prompt",
                placeholder="Type your prompt here...",
                scale=20,
            )
            random_prompt_btn = gr.Button("🎲", scale=1, elem_id="dice-button")

        gr.Markdown("<br>")

        # Add the random prompt button functionality
        random_prompt_btn.click(
            random_prompt,
            outputs=prompt_input,
        )

        submit_btn = gr.Button("Generate Responses", variant="primary")

        with gr.Row():
            left_output = gr.Chatbot(label=_left_nickname, type="messages")
            right_output = gr.Chatbot(label=_right_nickname, type="messages")

        with gr.Row():
            # Voting stays disabled until responses have been generated.
            left_vote_btn = gr.Button(f"Vote for {left_output.label}", interactive=False)
            right_vote_btn = gr.Button(f"Vote for {right_output.label}", interactive=False)

        # Hidden until there is something to report.
        result = gr.Textbox(label="Result", interactive=False, visible=False)

        # Model identities stay hidden until the row is made visible.
        with gr.Row(visible=False) as model_names_row:
            left_model = gr.Textbox(label="🔵 Left Model", interactive=False)
            right_model = gr.Textbox(label="🔴 Right Model", interactive=False)

        new_battle_btn = gr.Button("New Battle")

    # Leaderboard Tab
    with gr.Tab("Leaderboard"):
        leaderboard = gr.HTML(label="Leaderboard")

    # Performance Chart Tab
    with gr.Tab("Performance Chart"):
        leaderboard_chart = gr.Plot(label="Model Performance Chart")

    # Define interactions
    # battle_arena returns eight values. The chat panes are listed twice: the
    # first pair receives the raw messages, the second pair receives the
    # update that also sets each pane's nickname label.
    submit_btn.click(
        battle_arena,
        inputs=prompt_input,
        outputs=[left_output, right_output, left_model, right_model,
                 left_output, right_output, left_vote_btn, right_vote_btn],
    )

    left_vote_btn.click(
        lambda *args: record_vote(*args, "Left is better"),
        inputs=[prompt_input, left_output, right_output, left_model, right_model],
        outputs=[result, leaderboard, left_vote_btn,
                 right_vote_btn, model_names_row, leaderboard_chart],
    )

    right_vote_btn.click(
        lambda *args: record_vote(*args, "Right is better"),
        inputs=[prompt_input, left_output, right_output, left_model, right_model],
        outputs=[result, leaderboard, left_vote_btn,
                 right_vote_btn, model_names_row, leaderboard_chart],
    )

    new_battle_btn.click(
        new_battle,
        outputs=[prompt_input, left_output, right_output, left_model,
                 right_model, left_vote_btn, right_vote_btn,
                 result, leaderboard, model_names_row, leaderboard_chart],
    )

    # Update leaderboard and chart on launch
    demo.load(get_leaderboard, outputs=leaderboard)
    demo.load(get_leaderboard_chart, outputs=leaderboard_chart)
358
+
359
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()