import gradio as gr
from functools import lru_cache
import random
import requests
import logging
import arena_config
import plotly.graph_objects as go
from typing import Dict
from leaderboard import (
    get_current_leaderboard,
    update_leaderboard,
    start_backup_thread,
    get_leaderboard,
    get_elo_leaderboard,
    ensure_elo_ratings_initialized
)
import sys
from internal_stats import get_fun_stats
import threading
import time
from collections import Counter

# Initialize logging for errors only
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

# Start the backup thread
start_backup_thread()

# Text handed back to the UI whenever a model call fails.
_OLLAMA_ERROR_MESSAGE = "Error: Unable to get response from the model."


# Function to get available models (using predefined list)
def get_available_models():
    """Return the first element of every entry in ``arena_config.APPROVED_MODELS``."""
    return [model[0] for model in arena_config.APPROVED_MODELS]


@lru_cache(maxsize=100)
def _fetch_ollama_completion(model, prompt):
    """POST a single-turn chat request and return the assistant's text.

    Failures are raised rather than converted to a message, so that
    ``lru_cache`` only ever stores successful completions (a call that
    raises is not cached).
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }
    response = requests.post(
        f"{arena_config.API_URL}/v1/chat/completions",
        headers=arena_config.HEADERS,
        json=payload,
        timeout=100
    )
    response.raise_for_status()
    data = response.json()
    return data["choices"][0]["message"]["content"]


# Function to call Ollama API with caching
def call_ollama_api(model, prompt):
    """Return the model's reply to ``prompt``, or an error string on failure.

    Successful replies are cached per ``(model, prompt)`` pair (up to 100
    entries). Errors are logged and reported as a fixed message, and are
    never cached, so a transient outage does not stick to a prompt.

    Args:
        model: Model identifier sent in the request payload.
        prompt: User message sent as the only chat message.

    Returns:
        The ``choices[0].message.content`` string of the response, or
        ``"Error: Unable to get response from the model."`` when the request
        fails or the response does not have the expected shape.
    """
    try:
        return _fetch_ollama_completion(model, prompt)
    except (
        requests.exceptions.RequestException,
        # A 200 response with an unexpected/malformed body must not crash
        # the UI handler either.
        KeyError,
        IndexError,
        TypeError,
        ValueError,
    ) as e:
        logger.error("Error calling Ollama API for model %s: %s", model, e)
        return _OLLAMA_ERROR_MESSAGE


# Keep the cache helpers reachable under the public name, as they were when
# the decorator sat directly on call_ollama_api.
call_ollama_api.cache_info = _fetch_ollama_completion.cache_info
call_ollama_api.cache_clear = _fetch_ollama_completion.cache_clear
# Generate responses using two randomly selected models def get_battle_counts(): leaderboard = get_current_leaderboard() battle_counts = Counter() for model, data in leaderboard.items(): battle_counts[model] = data['wins'] + data['losses'] return battle_counts def generate_responses(prompt): available_models = get_available_models() if len(available_models) < 2: return "Error: Not enough models available", "Error: Not enough models available", None, None battle_counts = get_battle_counts() # Sort models by battle count (ascending) sorted_models = sorted(available_models, key=lambda m: battle_counts.get(m, 0)) # Select the first model (least battles) model_a = sorted_models[0] # For the second model, use weighted random selection weights = [1 / (battle_counts.get(m, 1) + 1) for m in sorted_models[1:]] model_b = random.choices(sorted_models[1:], weights=weights, k=1)[0] model_a_response = call_ollama_api(model_a, prompt) model_b_response = call_ollama_api(model_b, prompt) return model_a_response, model_b_response, model_a, model_b def battle_arena(prompt): response_a, response_b, model_a, model_b = generate_responses(prompt) nickname_a = random.choice(arena_config.model_nicknames) nickname_b = random.choice(arena_config.model_nicknames) # Format responses for gr.Chatbot, including the user's prompt response_a_formatted = [ {"role": "user", "content": prompt}, {"role": "assistant", "content": response_a} ] response_b_formatted = [ {"role": "user", "content": prompt}, {"role": "assistant", "content": response_b} ] if random.choice([True, False]): return ( response_a_formatted, response_b_formatted, model_a, model_b, gr.update(label=nickname_a, value=response_a_formatted), gr.update(label=nickname_b, value=response_b_formatted), gr.update(interactive=True, value=f"Vote for {nickname_a}"), gr.update(interactive=True, value=f"Vote for {nickname_b}"), gr.update(interactive=True, visible=True), # Enable and show Tie button prompt, # Return the original prompt 0 # Initialize 
tie count ) else: return ( response_b_formatted, response_a_formatted, model_b, model_a, gr.update(label=nickname_a, value=response_b_formatted), gr.update(label=nickname_b, value=response_a_formatted), gr.update(interactive=True, value=f"Vote for {nickname_a}"), gr.update(interactive=True, value=f"Vote for {nickname_b}"), gr.update(interactive=True, visible=True), # Enable and show Tie button prompt, # Return the original prompt 0 # Initialize tie count ) def record_vote(prompt, left_response, right_response, left_model, right_model, choice): # Check if outputs are generated if not left_response or not right_response or not left_model or not right_model: return ( "Please generate responses before voting.", gr.update(), gr.update(interactive=False), gr.update(interactive=False), gr.update(visible=False), gr.update() ) winner = left_model if choice == "Left is better" else right_model loser = right_model if choice == "Left is better" else left_model # Update the leaderboard battle_results = update_leaderboard(winner, loser) result_message = f""" 🎉 Vote recorded! You're awesome! 🌟 🔵 In the left corner: {get_human_readable_name(left_model)} 🔴 In the right corner: {get_human_readable_name(right_model)} 🏆 And the champion you picked is... {get_human_readable_name(winner)}! 
🥇 """ return ( gr.update(value=result_message, visible=True), # Show result as Markdown get_leaderboard(), # Update leaderboard get_elo_leaderboard(), # Add this line gr.update(interactive=False), # Disable left vote button gr.update(interactive=False), # Disable right vote button gr.update(interactive=False), # Disable tie button gr.update(visible=True), # Show model names get_leaderboard_chart() # Update leaderboard chart ) def get_leaderboard_chart(): battle_results = get_current_leaderboard() # Calculate scores and sort results for model, results in battle_results.items(): total_battles = results["wins"] + results["losses"] if total_battles > 0: win_rate = results["wins"] / total_battles results["score"] = win_rate * (1 - 1 / (total_battles + 1)) else: results["score"] = 0 sorted_results = sorted( battle_results.items(), key=lambda x: (x[1]["score"], x[1]["wins"] + x[1]["losses"]), reverse=True ) models = [get_human_readable_name(model) for model, _ in sorted_results] wins = [results["wins"] for _, results in sorted_results] losses = [results["losses"] for _, results in sorted_results] scores = [results["score"] for _, results in sorted_results] fig = go.Figure() # Stacked Bar chart for Wins and Losses fig.add_trace(go.Bar( x=models, y=wins, name='Wins', marker_color='#22577a' )) fig.add_trace(go.Bar( x=models, y=losses, name='Losses', marker_color='#38a3a5' )) # Line chart for Scores fig.add_trace(go.Scatter( x=models, y=scores, name='Score', yaxis='y2', line=dict(color='#ff7f0e', width=2) )) # Update layout for full-width, increased height, and secondary y-axis fig.update_layout( title='Model Performance', xaxis_title='Models', yaxis_title='Number of Battles', yaxis2=dict( title='Score', overlaying='y', side='right' ), barmode='stack', height=800, width=1450, autosize=True, legend=dict( orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1 ) ) chart_data = fig.to_json() return fig def new_battle(): nickname_a = 
random.choice(arena_config.model_nicknames) nickname_b = random.choice(arena_config.model_nicknames) return ( "", # Reset prompt_input gr.update(value=[], label=nickname_a), # Reset left Chatbot gr.update(value=[], label=nickname_b), # Reset right Chatbot None, None, gr.update(interactive=False, value=f"Vote for {nickname_a}"), gr.update(interactive=False, value=f"Vote for {nickname_b}"), gr.update(interactive=False, visible=False), # Reset Tie button gr.update(value="", visible=False), gr.update(), gr.update(visible=False), gr.update(), 0 # Reset tie_count ) # Add this new function def get_human_readable_name(model_name: str) -> str: model_dict = dict(arena_config.APPROVED_MODELS) return model_dict.get(model_name, model_name) # Add this new function to randomly select a prompt def random_prompt(): return random.choice(arena_config.example_prompts) # Modify the continue_conversation function def continue_conversation(prompt, left_chat, right_chat, left_model, right_model, previous_prompt, tie_count): # Check if the prompt is empty or the same as the previous one if not prompt or prompt == previous_prompt: prompt = random.choice(arena_config.example_prompts) left_response = call_ollama_api(left_model, prompt) right_response = call_ollama_api(right_model, prompt) left_chat.append({"role": "user", "content": prompt}) left_chat.append({"role": "assistant", "content": left_response}) right_chat.append({"role": "user", "content": prompt}) right_chat.append({"role": "assistant", "content": right_response}) tie_count += 1 tie_button_state = gr.update(interactive=True) if tie_count < 3 else gr.update(interactive=False, value="Max ties reached. Please vote!") return ( gr.update(value=left_chat), gr.update(value=right_chat), gr.update(value=""), # Clear the prompt input tie_button_state, prompt, # Return the new prompt tie_count ) def get_fun_stats_html(): stats = get_fun_stats() html = f"""
Total Battles Fought: {stats['total_battles']}
Active Gladiators (Models): {stats['active_models']}
🥇 Battle Veteran: {stats['most_battles']['model']} ({stats['most_battles']['battles']} battles)
🏹 Sharpshooter: {stats['highest_win_rate']['model']} (Win Rate: {stats['highest_win_rate']['win_rate']})
🌈 Jack of All Trades: {stats['most_diverse_opponent']['model']} (Faced {stats['most_diverse_opponent']['unique_opponents']} unique opponents)
🐕 Underdog Champion: {stats['underdog_champion']['model']} ({stats['underdog_champion']['size']} model with {stats['underdog_champion']['win_rate']} win rate)
⚖️ Mr. Consistent: {stats['most_consistent']['model']} (Closest to 50% win rate, difference of {stats['most_consistent']['win_loss_difference']} wins/losses)
🤼 Biggest Rivalry: {stats['biggest_rivalry']['model1']} vs {stats['biggest_rivalry']['model2']} ({stats['biggest_rivalry']['total_battles']} fierce battles!)
🏋️ David vs Goliath: {stats['david_vs_goliath']['david']} (David) vs {stats['david_vs_goliath']['goliath']} (Goliath)
David won {stats['david_vs_goliath']['wins']} times despite being {stats['david_vs_goliath']['size_difference']} smaller!
🔄 Comeback King: {stats['comeback_king']['model']} (Overcame a {stats['comeback_king']['comeback_margin']}-battle deficit)
🏆 Pyrrhic Victor: {stats['pyrrhic_victor']['model']} (Lowest win rate among models with more wins than losses: {stats['pyrrhic_victor']['win_rate']})