|
|
|
|
|
|
|
|
|
import dotenv |
|
import evalica |
|
import gitlab |
|
import io |
|
import json |
|
import os |
|
import random |
|
import re |
|
import threading |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
|
|
from datetime import datetime |
|
from github import Github |
|
from gradio_leaderboard import Leaderboard |
|
from huggingface_hub import upload_file, hf_hub_download, HfFolder, HfApi |
|
from openai import OpenAI |
|
|
|
|
|
dotenv.load_dotenv() |
|
|
|
|
|
api_key = os.getenv("API_KEY") |
|
base_url = "https://api.pandalla.ai/v1" |
|
openai_client = OpenAI(api_key=api_key, base_url=base_url) |
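
# Note: this single OpenAI-compatible client serves both the arena models

# and the gpt-4o-mini guardrail in guardrail_check_se_relevance().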
|
|
|
|
|
TIMEOUT = 90 |
|
|
|
|
|
SHOW_HINT_STRING = True |
|
HINT_STRING = "Once signed in, your votes will be recorded securely." |
|
|
|
|
|
with open("context_window.json", "r") as file: |
|
context_window = json.load(file) |
|
|
|
|
|
available_models = list(context_window.keys()) |
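
# context_window.json maps each model name to its context length, which

# truncate_prompt() uses as a budget. Illustrative shape: {"gpt-4o": 128000}.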
|
|
|
|
|
def fetch_github_content(url): |
|
"""Fetch detailed content from a GitHub URL using PyGithub.""" |
|
token = os.getenv("GITHUB_TOKEN") |
|
if not token: |
|
print("GITHUB_TOKEN not set.") |
|
return None |
|
|
|
g = Github(token) |
|
|
|
try: |
|
        match = re.match(

            # GitHub Discussions are not exposed through PyGithub's REST API,

            # so only commits, pull requests, and issues are handled here.

            r"https?://github\.com/([^/]+)/([^/]+)/(commit|pull|issues)/([A-Za-z0-9]+)",

            url,

        )
|
|
|
if not match: |
|
repo_part = re.match(r"https?://github\.com/([^/]+)/([^/]+)/?", url) |
|
if repo_part: |
|
owner, repo = repo_part.groups() |
|
repo = g.get_repo(f"{owner}/{repo}") |
|
try: |
|
readme = repo.get_readme() |
|
return readme.decoded_content.decode() |
|
                except Exception:

                    # No README available; fall back to the repo description.

                    return repo.description
|
return None |
|
|
|
owner, repo, category, identifier = match.groups() |
|
repo = g.get_repo(f"{owner}/{repo}") |
|
|
|
if category == "commit": |
|
commit = repo.get_commit(identifier) |
|
return commit.__dict__ |
|
|
|
elif category in ["pull", "issues"]: |
|
obj = ( |
|
repo.get_pull(int(identifier)) |
|
if category == "pull" |
|
else repo.get_issue(int(identifier)) |
|
) |
|
return obj.__dict__ |
|
|
|
except Exception as e: |
|
print(f"GitHub API error: {e}") |
|
return None |
|
|
|
|
|
def fetch_gitlab_content(url): |
|
"""Fetch content from GitLab URL using python-gitlab.""" |
|
token = os.getenv("GITLAB_TOKEN") |
|
if not token: |
|
print("GITLAB_TOKEN not set.") |
|
return None |
|
gl = gitlab.Gitlab(private_token=token) |
|
|
|
try: |
|
match = re.match( |
|
r"https?://gitlab\.com/([^/]+)/([^/]+)/-/?(commit|merge_requests|issues)/([^/]+)", |
|
url, |
|
) |
|
if not match: |
|
repo_part = re.match(r"https?://gitlab\.com/([^/]+)/([^/]+)/?", url) |
|
if repo_part: |
|
owner, repo = repo_part.groups() |
|
project = gl.projects.get(f"{owner}/{repo}") |
|
                try:

                    readme = project.files.get(

                        file_path="README.md", ref=project.default_branch

                    )

                    # ProjectFile.decode() returns bytes; decode them to str.

                    return readme.decode().decode("utf-8")

                except gitlab.exceptions.GitlabGetError:

                    return project.description
|
return None |
|
|
|
owner, repo, category, identifier = match.groups() |
|
project = gl.projects.get(f"{owner}/{repo}") |
|
|
|
if category == "commit": |
|
commit = project.commits.get(identifier) |
|
return commit.__dict__ |
|
|
|
elif category == "merge_requests": |
|
merge_request = project.mergerequests.get(int(identifier)) |
|
return merge_request.__dict__ |
|
|
|
elif category == "issues": |
|
issue = project.issues.get(int(identifier)) |
|
return issue.__dict__ |
|
|
|
except Exception as e: |
|
print(f"GitLab API error: {e}") |
|
return None |
|
|
|
|
|
def fetch_huggingface_content(url): |
|
"""Fetch detailed content from a Hugging Face URL using huggingface_hub API.""" |
|
token = os.getenv("HF_TOKEN") |
|
if not token: |
|
print("HF_TOKEN not set.") |
|
return None |
|
|
|
api = HfApi(token=token) |
|
|
|
try: |
|
if "/commit/" in url: |
|
commit_hash = url.split("/commit/")[-1] |
|
repo_id = url.split("/commit/")[0].split("huggingface.co/")[-1] |
|
commits = api.list_repo_commits(repo_id=repo_id, revision=commit_hash) |
|
if commits: |
|
commit = commits[0] |
|
return commit.__dict__ |
|
return None |
|
|
|
elif "/discussions/" in url: |
|
discussion_num = int(url.split("/discussions/")[-1]) |
|
repo_id = url.split("/discussions/")[0].split("/huggingface.co/")[-1] |
|
discussion = api.get_discussion_details( |
|
repo_id=repo_id, discussion_num=discussion_num |
|
) |
|
return discussion.__dict__ |
|
|
|
else: |
|
repo_id = url.split("huggingface.co/")[-1] |
|
repo_info = api.repo_info(repo_id=repo_id) |
|
return repo_info.__dict__ |
|
|
|
except Exception as e: |
|
print(f"Hugging Face API error: {e}") |
|
return None |
|
|
|
|
|
def fetch_url_content(url): |
|
"""Main URL content fetcher that routes to platform-specific handlers.""" |
|
try: |
|
if "github.com" in url: |
|
return fetch_github_content(url) |
|
elif "gitlab.com" in url: |
|
return fetch_gitlab_content(url) |
|
elif "huggingface.co" in url: |
|
return fetch_huggingface_content(url) |
|
except Exception as e: |
|
print(f"Error fetching URL content: {e}") |
|
return "" |
|
|
|
|
|
|
|
def truncate_prompt(model_alias, models, conversation_state): |
|
""" |
|
Truncate the conversation history and user input to fit within the model's context window. |
|
|
|
Args: |
|
model_alias (str): Alias for the model being used (i.e., "left", "right"). |
|
models (dict): Dictionary mapping model aliases to their names. |
|
conversation_state (dict): State containing the conversation history for all models. |
|
|
|
Returns: |
|
str: Truncated conversation history and user input. |
|
""" |
|
|
|
full_conversation = conversation_state[f"{model_alias}_chat"] |
|
|
|
|
|
context_length = context_window[models[model_alias]] |
|
|
|
|
|
while len(json.dumps(full_conversation)) > context_length: |
|
|
|
if len(full_conversation) > 1: |
|
full_conversation.pop(0) |
|
|
|
else: |
|
current_length = len(json.dumps(full_conversation)) |
|
|
|
excess = current_length - context_length |
|
|
|
truncation_size = min(excess + 10, len(full_conversation[0]["content"])) |
|
|
|
if truncation_size <= 0: |
|
break |
|
|
|
|
|
full_conversation[0]["content"] = full_conversation[0]["content"][ |
|
:-truncation_size |
|
] |
|
|
|
return full_conversation |
|
|
|
|
|
def chat_with_models(model_alias, models, conversation_state, timeout=TIMEOUT): |
|
truncated_input = truncate_prompt(model_alias, models, conversation_state) |
|
response_event = threading.Event() |
|
model_response = {"content": None, "error": None} |
|
|
|
def request_model_response(): |
|
try: |
|
request_params = {"model": models[model_alias], "messages": truncated_input} |
|
response = openai_client.chat.completions.create(**request_params) |
|
model_response["content"] = response.choices[0].message.content |
|
except Exception as e: |
|
model_response["error"] = ( |
|
f"{models[model_alias]} model is not available. Error: {e}" |
|
) |
|
finally: |
|
response_event.set() |
|
|
|
|
|
response_thread = threading.Thread(target=request_model_response) |
|
response_thread.start() |
|
|
|
|
|
response_event_occurred = response_event.wait(timeout) |
|
|
|
if not response_event_occurred: |
|
raise TimeoutError( |
|
f"The {model_alias} model did not respond within {timeout} seconds." |
|
) |
|
elif model_response["error"]: |
|
raise Exception(model_response["error"]) |
|
else: |
|
|
|
model_key = f"{model_alias}_chat" |
|
|
|
|
|
conversation_state[model_key].append( |
|
{"role": "assistant", "content": model_response["content"]} |
|
) |
|
|
|
|
|
        formatted_history = format_conversation_history(

            # Skip the first user message; it is displayed separately as the query.

            conversation_state[model_key][1:]

        )
|
|
|
return formatted_history |
|
|
|
|
|
def format_conversation_history(conversation_history): |
|
""" |
|
Format the conversation history with different colors for user and model messages. |
|
|
|
Args: |
|
conversation_history (list): List of conversation messages with role and content. |
|
|
|
Returns: |
|
str: Markdown formatted conversation history. |
|
""" |
|
formatted_text = "" |
|
|
|
for message in conversation_history: |
|
if message["role"] == "user": |
|
|
|
formatted_text += f"<div style='color: #0066cc; background-color: #f0f7ff; padding: 10px; border-radius: 5px; margin-bottom: 10px;'><strong>User:</strong> {message['content']}</div>\n\n" |
|
else: |
|
|
|
formatted_text += f"<div style='color: #006633; background-color: #f0fff0; padding: 10px; border-radius: 5px; margin-bottom: 10px;'><strong>Model:</strong> {message['content']}</div>\n\n" |
|
|
|
return formatted_text |
|
|
|
|
|
def save_content_to_hf(vote_data, repo_name, folder_name, file_name): |
|
""" |
|
    Save feedback content to a Hugging Face repository, organized by quarter.
|
""" |
|
|
|
json_content = json.dumps(vote_data, indent=4).encode("utf-8") |
|
|
|
|
|
file_like_object = io.BytesIO(json_content) |
|
|
|
|
|
filename = f"{folder_name}/{file_name}.json" |
|
|
|
|
|
token = HfFolder.get_token() |
|
if token is None: |
|
raise ValueError("Please log in to Hugging Face using `huggingface-cli login`.") |
|
|
|
|
|
    upload_file(

        path_or_fileobj=file_like_object,

        path_in_repo=filename,

        repo_id=repo_name,

        repo_type="dataset",

        token=token,  # `use_auth_token` is deprecated in recent huggingface_hub

    )
|
|
|
|
|
def load_content_from_hf(repo_name="SE-Arena/votes"): |
|
""" |
|
Read feedback content from a Hugging Face repository based on the current quarter. |
|
|
|
Args: |
|
repo_name (str): Hugging Face repository name. |
|
|
|
Returns: |
|
list: Aggregated feedback data read from the repository. |
|
""" |
|
vote_data = [] |
|
|
|
|
|
now = datetime.now() |
|
quarter = (now.month - 1) // 3 + 1 |
|
year_quarter = f"{now.year}_Q{quarter}" |
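
    # e.g. "2025_Q2" (illustrative): feedback files are partitioned by quarter.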
|
|
|
try: |
|
api = HfApi() |
|
|
|
for file in api.list_repo_files(repo_id=repo_name, repo_type="dataset"): |
|
|
|
if year_quarter not in file: |
|
continue |
|
|
|
local_path = hf_hub_download( |
|
repo_id=repo_name, filename=file, repo_type="dataset" |
|
) |
|
with open(local_path, "r") as f: |
|
data = json.load(f) |
|
data["timestamp"] = file.split("/")[-1].split(".")[0] |
|
vote_data.append(data) |
|
return vote_data |
|
|
|
    except Exception as e:

        raise Exception(

            f"Error loading feedback data from Hugging Face repository: {e}"

        )
|
|
|
|
|
def get_leaderboard_data(vote_entry=None): |
|
|
|
vote_data = load_content_from_hf() |
|
vote_df = pd.DataFrame(vote_data) |
|
|
|
|
|
if vote_entry is not None: |
|
vote_df = pd.concat([vote_df, pd.DataFrame([vote_entry])], ignore_index=True) |
|
|
|
if vote_df.empty: |
|
return pd.DataFrame( |
|
columns=[ |
|
"Rank", |
|
"Model", |
|
"Elo Score", |
|
"Conversation Efficiency Index", |
|
"Model Consistency Score", |
|
"Average Win Rate", |
|
"Average Failure Rate", |
|
"Bradley-Terry Coefficient", |
|
"Eigenvector Centrality Value", |
|
"Newman Modularity Score", |
|
"PageRank Score", |
|
] |
|
) |
|
|
|
|
|
conversation_data = load_content_from_hf("SE-Arena/conversations") |
|
conversation_df = pd.DataFrame(conversation_data) |
|
|
|
|
|
all_df = pd.merge( |
|
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner" |
|
) |
|
|
|
|
|
model_stats = {} |
|
|
|
|
|
for _, row in all_df.iterrows(): |
|
left_model = row["left"] |
|
right_model = row["right"] |
|
is_self_match = left_model == right_model |
|
|
|
|
|
for model in [left_model, right_model]: |
|
if model not in model_stats: |
|
model_stats[model] = { |
|
"cei_sum": 0, |
|
"cei_max": 0, |
|
"self_matches": 0, |
|
"self_draws": 0, |
|
} |
|
|
|
|
|
if is_self_match: |
|
model_stats[left_model]["self_matches"] += 1 |
|
if row["winner"] == "both_bad" or row["winner"] == "tie": |
|
model_stats[left_model]["self_draws"] += 1 |
|
continue |
|
|
|
|
|
match row["winner"]: |
|
case "left": |
|
left_score = 1 |
|
right_score = -1 |
|
case "right": |
|
left_score = -1 |
|
right_score = 1 |
|
case "tie": |
|
left_score = 0.3 |
|
right_score = 0.3 |
|
case "both_bad": |
|
left_score = -0.3 |
|
right_score = -0.3 |
|
|
|
|
|
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant") |
|
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant") |
|
|
|
|
|
model_stats[left_model]["cei_max"] += 1 / left_round |
|
model_stats[right_model]["cei_max"] += 1 / right_round |
|
model_stats[left_model]["cei_sum"] += left_score / left_round |
|
model_stats[right_model]["cei_sum"] += right_score / right_round |
|
|
|
|
|
vote_df["winner"] = vote_df["winner"].map( |
|
{ |
|
"left": evalica.Winner.X, |
|
"right": evalica.Winner.Y, |
|
"tie": evalica.Winner.Draw, |
|
"both_bad": evalica.Winner.Draw, |
|
} |
|
) |
|
|
|
|
|
avr_result = evalica.average_win_rate( |
|
vote_df["left"], |
|
vote_df["right"], |
|
vote_df["winner"], |
|
tie_weight=0, |
|
) |
|
bt_result = evalica.bradley_terry( |
|
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0 |
|
) |
|
newman_result = evalica.newman( |
|
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0 |
|
) |
|
eigen_result = evalica.eigen( |
|
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0 |
|
) |
|
elo_result = evalica.elo( |
|
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0 |
|
) |
|
pagerank_result = evalica.pagerank( |
|
vote_df["left"], vote_df["right"], vote_df["winner"], tie_weight=0 |
|
) |
|
|
|
|
|
cei_result = {} |
|
for model in elo_result.scores.index: |
|
if model in model_stats and model_stats[model]["cei_max"] > 0: |
|
cei_result[model] = round( |
|
model_stats[model]["cei_sum"] / model_stats[model]["cei_max"], 2 |
|
) |
|
else: |
|
cei_result[model] = "N/A" |
|
cei_result = pd.Series(cei_result) |
|
|
|
|
|
mcs_result = {} |
|
for model in elo_result.scores.index: |
|
if model in model_stats and model_stats[model]["self_matches"] > 0: |
|
mcs_result[model] = round( |
|
model_stats[model]["self_draws"] / model_stats[model]["self_matches"], 2 |
|
) |
|
else: |
|
mcs_result[model] = "N/A" |
|
mcs_result = pd.Series(mcs_result) |
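
    # MCS is the fraction of a model's self-matches judged a draw; a perfectly

    # consistent model would score 1.0.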
|
|
|
|
|
leaderboard_data = pd.DataFrame( |
|
{ |
|
"Model": elo_result.scores.index, |
|
"Elo Score": elo_result.scores.values, |
|
"Conversation Efficiency Index": cei_result.values, |
|
"Model Consistency Score": mcs_result.values, |
|
"Average Win Rate": avr_result.scores.values, |
|
"Bradley-Terry Coefficient": bt_result.scores.values, |
|
"Eigenvector Centrality Value": eigen_result.scores.values, |
|
"Newman Modularity Score": newman_result.scores.values, |
|
"PageRank Score": pagerank_result.scores.values, |
|
} |
|
) |
|
|
|
|
|
leaderboard_data = leaderboard_data.round( |
|
{ |
|
"Elo Score": 2, |
|
"Average Win Rate": 2, |
|
"Bradley-Terry Coefficient": 2, |
|
"Eigenvector Centrality Value": 2, |
|
"Newman Modularity Score": 2, |
|
"PageRank Score": 2, |
|
} |
|
) |
|
|
|
|
|
leaderboard_data["Rank"] = ( |
|
leaderboard_data["Elo Score"].rank(method="min", ascending=False).astype(int) |
|
) |
|
|
|
|
|
leaderboard_data = leaderboard_data[ |
|
["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"] |
|
] |
|
return leaderboard_data |
|
|
|
|
|
|
|
def toggle_submit_button(text): |
|
if not text or text.strip() == "": |
|
return gr.update(interactive=False) |
|
else: |
|
return gr.update(interactive=True) |
|
|
|
|
|
|
|
with gr.Blocks() as app: |
|
user_authenticated = gr.State(False) |
|
models_state = gr.State({}) |
|
conversation_state = gr.State({}) |
|
|
|
with gr.Tab("🏆Leaderboard"): |
|
|
|
leaderboard_intro = gr.Markdown( |
|
""" |
|
# 🏆 FM4SE Leaderboard: Community-Driven Evaluation of Top Foundation Models (FMs) in Software Engineering (SE) Tasks |
|
            SE Arena is an open-source platform for evaluating foundation models through human preference, fostering transparency and collaboration. It empowers the SE community to assess and compare the performance of leading FMs on SE-related tasks. For technical details, check out our [paper](https://arxiv.org/abs/2502.01860).
|
""", |
|
elem_classes="leaderboard-intro", |
|
) |
|
|
|
leaderboard_component = Leaderboard( |
|
value=get_leaderboard_data(), |
|
select_columns=[ |
|
"Rank", |
|
"Model", |
|
"Elo Score", |
|
"Conversation Efficiency Index", |
|
"Model Consistency Score", |
|
], |
|
search_columns=["Model"], |
|
filter_columns=[ |
|
"Elo Score", |
|
"Conversation Efficiency Index", |
|
"Model Consistency Score", |
|
"Average Win Rate", |
|
"Bradley-Terry Coefficient", |
|
"Eigenvector Centrality Value", |
|
"Newman Modularity Score", |
|
"PageRank Score", |
|
], |
|
) |
|
|
|
citation_component = gr.Markdown( |
|
""" |
|
Made with ❤️ for SE Arena. If this work is useful to you, please consider citing: |
|
``` |
|
@inproceedings{zhao2025se, |
|
title={SE Arena: An Interactive Platform for Evaluating Foundation Models in Software Engineering}, |
|
author={Zhao, Zhimin}, |
|
            booktitle={ACM International Conference on AI Foundation Models and Software Engineering},
|
year={2025}} |
|
``` |
|
""" |
|
) |
|
with gr.Tab("⚔️Arena"): |
|
|
|
arena_intro = gr.Markdown( |
|
f""" |
|
# ⚔️ SE Arena: Explore and Test Top FMs with SE Tasks by Community Voting |
|
|
|
## 📜How It Works |
|
            - **Blind Comparison**: Submit an SE-related query to two anonymous FMs, randomly selected from a pool of up to {len(available_models)} top models from OpenAI, Gemini, Grok, Claude, DeepSeek, Qwen, Llama, Mistral, and others.
|
- **Interactive Voting**: Engage in multi-turn dialogues with both FMs and compare their responses. You can continue the conversation until you confidently choose the better model. |
|
            - **Fair Play Rules**: Votes are counted only if FM identities remain anonymous. Revealing an FM's identity disqualifies the session.
|
|
|
**Note:** Due to budget constraints, responses that take longer than {TIMEOUT} seconds to generate will be discarded. |
|
""", |
|
elem_classes="arena-intro", |
|
) |
|
|
|
with gr.Row(): |
|
|
|
markdown_text = "## Please sign in first to vote!" |
|
if SHOW_HINT_STRING: |
|
markdown_text += f"\n{HINT_STRING}" |
|
hint_markdown = gr.Markdown(markdown_text, elem_classes="markdown-text") |
|
login_button = gr.Button( |
|
"Sign in with Hugging Face", elem_id="oauth-button" |
|
) |
|
|
|
guardrail_message = gr.Markdown("", visible=False, elem_id="guardrail-message") |
|
|
|
|
|
repo_url = gr.Textbox( |
|
show_label=False, |
|
placeholder="Optional: Enter the URL of a repository (GitHub, GitLab, Hugging Face), issue, commit, or pull request.", |
|
lines=1, |
|
interactive=False, |
|
) |
|
|
|
|
|
shared_input = gr.Textbox( |
|
show_label=False, |
|
placeholder="Enter your query for both models here.", |
|
lines=2, |
|
interactive=False, |
|
) |
|
send_first = gr.Button( |
|
"Submit", visible=True, interactive=False |
|
) |
|
|
|
|
|
shared_input.change( |
|
fn=toggle_submit_button, inputs=shared_input, outputs=send_first |
|
) |
|
|
|
user_prompt_md = gr.Markdown(value="", visible=False) |
|
|
|
        # shared_input and user_prompt_md are created and rendered above.
|
|
|
with gr.Row(): |
|
response_a_title = gr.Markdown(value="", visible=False) |
|
response_b_title = gr.Markdown(value="", visible=False) |
|
|
|
with gr.Row(): |
|
response_a = gr.Markdown(label="Response from Model A") |
|
response_b = gr.Markdown(label="Response from Model B") |
|
|
|
|
|
with gr.Row(visible=False) as timeout_popup: |
|
            timeout_message = gr.Markdown(

                f"### Timeout\n\nOne of the models did not respond within {TIMEOUT} seconds. Please try again."

            )
|
close_popup_btn = gr.Button("Okay") |
|
|
|
def close_timeout_popup(): |
|
|
|
shared_input_state = gr.update(interactive=True) |
|
send_first_state = toggle_submit_button(shared_input.value) |
|
|
|
model_a_input_state = gr.update(interactive=True) |
|
model_a_send_state = toggle_submit_button(model_a_input.value) |
|
|
|
model_b_input_state = gr.update(interactive=True) |
|
model_b_send_state = toggle_submit_button(model_b_input.value) |
|
|
|
|
|
repo_url_state = gr.update(interactive=True) |
|
|
|
return ( |
|
gr.update(visible=False), |
|
shared_input_state, |
|
send_first_state, |
|
model_a_input_state, |
|
model_a_send_state, |
|
model_b_input_state, |
|
model_b_send_state, |
|
repo_url_state, |
|
) |
|
|
|
|
|
with gr.Row(visible=False) as multi_round_inputs: |
|
model_a_input = gr.Textbox(label="Model A Input", lines=1) |
|
model_a_send = gr.Button( |
|
"Send to Model A", interactive=False |
|
) |
|
|
|
model_b_input = gr.Textbox(label="Model B Input", lines=1) |
|
model_b_send = gr.Button( |
|
"Send to Model B", interactive=False |
|
) |
|
|
|
|
|
model_a_input.change( |
|
fn=toggle_submit_button, inputs=model_a_input, outputs=model_a_send |
|
) |
|
|
|
model_b_input.change( |
|
fn=toggle_submit_button, inputs=model_b_input, outputs=model_b_send |
|
) |
|
|
|
close_popup_btn.click( |
|
close_timeout_popup, |
|
inputs=[], |
|
outputs=[ |
|
timeout_popup, |
|
shared_input, |
|
send_first, |
|
model_a_input, |
|
model_a_send, |
|
model_b_input, |
|
model_b_send, |
|
repo_url, |
|
], |
|
) |
|
|
|
def guardrail_check_se_relevance(user_input): |
|
""" |
|
Use gpt-4o-mini to check if the user input is SE-related. |
|
Return True if it is SE-related, otherwise False. |
|
""" |
|
|
|
system_message = { |
|
"role": "system", |
|
"content": ( |
|
"You are a classifier that decides if a user's question is relevant to software engineering. " |
|
"If the question is about software engineering concepts, tools, processes, or code, respond with 'Yes'. " |
|
"Otherwise, respond with 'No'." |
|
), |
|
} |
|
user_message = {"role": "user", "content": user_input} |
|
|
|
try: |
|
|
|
response = openai_client.chat.completions.create( |
|
model="gpt-4o-mini", messages=[system_message, user_message] |
|
) |
|
                classification = response.choices[0].message.content.strip().lower()

                return classification.startswith("yes")
|
except Exception as e: |
|
print(f"Guardrail check failed: {e}") |
|
|
|
|
|
return True |
|
|
|
def disable_first_submit_ui(): |
|
"""First function to immediately disable UI elements""" |
|
return ( |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(interactive=False, value="Processing..."), |
|
) |
|
|
|
|
|
def update_model_titles_and_responses( |
|
repo_url, user_input, models_state, conversation_state |
|
): |
|
|
|
if not repo_url and not guardrail_check_se_relevance(user_input): |
|
|
|
return ( |
|
|
|
gr.update( |
|
value="### Oops! Try asking something about software engineering. Thanks!", |
|
visible=True, |
|
), |
|
|
|
gr.update(value="", interactive=True, visible=True), |
|
|
|
gr.update(value="", interactive=True, visible=True), |
|
|
|
gr.update(value="", visible=False), |
|
|
|
gr.update(value="", visible=False), |
|
|
|
gr.update(value="", visible=False), |
|
|
|
gr.update(value=""), |
|
|
|
gr.update(value=""), |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(visible=True, interactive=True, value="Submit"), |
|
|
|
gr.update(interactive=True), |
|
|
|
models_state, |
|
|
|
conversation_state, |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(visible=False), |
|
) |
|
|
|
|
|
repo_info = fetch_url_content(repo_url) |
|
combined_user_input = ( |
|
f"Context: {repo_info}\n\nInquiry: {user_input}" |
|
if repo_info |
|
else user_input |
|
) |
|
|
|
|
|
selected_models = [random.choice(available_models) for _ in range(2)] |
|
models = {"left": selected_models[0], "right": selected_models[1]} |
|
|
|
|
|
conversations = models.copy() |
|
conversations.update( |
|
{ |
|
"url": repo_url, |
|
"left_chat": [{"role": "user", "content": combined_user_input}], |
|
"right_chat": [{"role": "user", "content": combined_user_input}], |
|
} |
|
) |
|
|
|
|
|
models_state.clear() |
|
conversation_state.clear() |
|
|
|
|
|
models_state.update(models) |
|
conversation_state.update(conversations) |
|
|
|
try: |
|
response_a = chat_with_models("left", models_state, conversation_state) |
|
response_b = chat_with_models("right", models_state, conversation_state) |
|
except TimeoutError as e: |
|
|
|
return ( |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(value="", interactive=True, visible=True), |
|
|
|
gr.update(value="", interactive=True, visible=True), |
|
|
|
gr.update(value="", visible=False), |
|
|
|
gr.update(value="", visible=False), |
|
|
|
gr.update(value="", visible=False), |
|
|
|
gr.update(value=""), |
|
|
|
gr.update(value=""), |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(visible=True, interactive=True, value="Submit"), |
|
|
|
gr.update(interactive=False), |
|
|
|
models_state, |
|
|
|
conversation_state, |
|
|
|
gr.update(visible=True), |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(visible=False), |
|
) |
|
except Exception as e: |
|
raise gr.Error(str(e)) |
|
|
|
|
|
model_a_send_state = toggle_submit_button("") |
|
model_b_send_state = toggle_submit_button("") |
|
display_content = f"### Your Query:\n\n{user_input}" |
|
if repo_info: |
|
display_content += f"\n\n### Repo-related URL:\n\n{repo_url}" |
|
|
|
|
|
return ( |
|
|
|
gr.update(visible=False), |
|
|
|
gr.update(interactive=True, visible=False), |
|
|
|
gr.update(interactive=True, visible=False), |
|
|
|
gr.update(value=display_content, visible=True), |
|
|
|
gr.update(value="### Model A:", visible=True), |
|
|
|
gr.update(value="### Model B:", visible=True), |
|
|
|
gr.update(value=response_a), |
|
|
|
gr.update(value=response_b), |
|
|
|
gr.update(visible=True), |
|
|
|
gr.update(visible=True), |
|
|
|
gr.update(visible=False, value="Submit"), |
|
|
|
gr.update(interactive=True), |
|
|
|
models_state, |
|
|
|
conversation_state, |
|
|
|
gr.update(visible=False), |
|
|
|
model_a_send_state, |
|
|
|
model_b_send_state, |
|
|
|
gr.update(visible=False), |
|
) |
|
|
|
|
|
with gr.Row(visible=False) as vote_panel: |
|
feedback = gr.Radio( |
|
choices=["Model A", "Model B", "Tie", "Tie (Both Bad)"], |
|
label="Which model do you prefer?", |
|
value="Tie", |
|
interactive=False, |
|
) |
|
submit_feedback_btn = gr.Button("Submit Feedback", interactive=False) |
|
|
|
thanks_message = gr.Markdown( |
|
value="## Thanks for your vote!", visible=False |
|
) |
|
|
|
def hide_thanks_message(): |
|
return gr.update(visible=False) |
|
|
|
|
|
def handle_login(): |
|
""" |
|
Handle user login using Hugging Face OAuth with automatic redirection. |
|
""" |
|
try: |
|
|
|
HfApi() |
|
token = HfFolder.get_token() |
|
if not token: |
|
raise Exception("Authentication token not found.") |
|
|
|
|
|
return ( |
|
gr.update(visible=False), |
|
gr.update(interactive=True), |
|
gr.update(interactive=True), |
|
gr.update( |
|
interactive=False |
|
), |
|
gr.update(interactive=True), |
|
gr.update(interactive=True), |
|
gr.update(visible=False), |
|
) |
|
except Exception as e: |
|
|
|
print(f"Login failed: {e}") |
|
return ( |
|
gr.update(visible=True), |
|
gr.update(interactive=False), |
|
gr.update(interactive=False), |
|
gr.update(interactive=False), |
|
gr.update( |
|
interactive=False |
|
), |
|
gr.update(interactive=False), |
|
gr.update(visible=True), |
|
) |
|
|
|
|
|
login_button.click( |
|
handle_login, |
|
inputs=[], |
|
outputs=[ |
|
login_button, |
|
repo_url, |
|
shared_input, |
|
send_first, |
|
feedback, |
|
submit_feedback_btn, |
|
hint_markdown, |
|
], |
|
) |
|
|
|
|
|
send_first.click( |
|
fn=hide_thanks_message, inputs=[], outputs=[thanks_message] |
|
).then( |
|
fn=disable_first_submit_ui, |
|
inputs=[], |
|
outputs=[ |
|
guardrail_message, |
|
shared_input, |
|
repo_url, |
|
send_first, |
|
], |
|
).then( |
|
fn=update_model_titles_and_responses, |
|
inputs=[repo_url, shared_input, models_state, conversation_state], |
|
outputs=[ |
|
guardrail_message, |
|
shared_input, |
|
repo_url, |
|
user_prompt_md, |
|
response_a_title, |
|
response_b_title, |
|
response_a, |
|
response_b, |
|
multi_round_inputs, |
|
vote_panel, |
|
send_first, |
|
feedback, |
|
models_state, |
|
conversation_state, |
|
timeout_popup, |
|
model_a_send, |
|
model_b_send, |
|
thanks_message, |
|
], |
|
) |
|
|
|
def disable_model_a_ui(): |
|
"""First function to immediately disable model A UI elements""" |
|
return ( |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(interactive=False, value="Processing..."), |
|
) |
|
|
|
|
|
def handle_model_a_send(user_input, models_state, conversation_state): |
|
try: |
|
conversation_state["left_chat"].append( |
|
{"role": "user", "content": user_input} |
|
) |
|
response = chat_with_models("left", models_state, conversation_state) |
|
|
|
return ( |
|
response, |
|
conversation_state, |
|
gr.update(visible=False), |
|
gr.update( |
|
value="", interactive=True |
|
), |
|
gr.update( |
|
interactive=False, value="Send to Model A" |
|
), |
|
) |
|
except TimeoutError as e: |
|
|
|
return ( |
|
gr.update(value=""), |
|
conversation_state, |
|
gr.update(visible=True), |
|
gr.update(interactive=True), |
|
gr.update( |
|
interactive=True, value="Send to Model A" |
|
), |
|
) |
|
except Exception as e: |
|
raise gr.Error(str(e)) |
|
|
|
def disable_model_b_ui(): |
|
"""First function to immediately disable model B UI elements""" |
|
return ( |
|
|
|
gr.update(interactive=False), |
|
|
|
gr.update(interactive=False, value="Processing..."), |
|
) |
|
|
|
def handle_model_b_send(user_input, models_state, conversation_state): |
|
try: |
|
conversation_state["right_chat"].append( |
|
{"role": "user", "content": user_input} |
|
) |
|
response = chat_with_models("right", models_state, conversation_state) |
|
|
|
return ( |
|
response, |
|
conversation_state, |
|
gr.update(visible=False), |
|
gr.update( |
|
value="", interactive=True |
|
), |
|
gr.update( |
|
interactive=False, value="Send to Model B" |
|
), |
|
) |
|
except TimeoutError as e: |
|
|
|
return ( |
|
gr.update(value=""), |
|
conversation_state, |
|
gr.update(visible=True), |
|
gr.update(interactive=True), |
|
gr.update( |
|
interactive=True, value="Send to Model B" |
|
), |
|
) |
|
except Exception as e: |
|
raise gr.Error(str(e)) |
|
|
|
model_a_send.click( |
|
fn=disable_model_a_ui, |
|
inputs=[], |
|
outputs=[model_a_input, model_a_send], |
|
).then( |
|
fn=handle_model_a_send, |
|
inputs=[model_a_input, models_state, conversation_state], |
|
outputs=[ |
|
response_a, |
|
conversation_state, |
|
timeout_popup, |
|
model_a_input, |
|
model_a_send, |
|
], |
|
) |
|
model_b_send.click( |
|
fn=disable_model_b_ui, |
|
inputs=[], |
|
outputs=[model_b_input, model_b_send], |
|
).then( |
|
fn=handle_model_b_send, |
|
inputs=[model_b_input, models_state, conversation_state], |
|
outputs=[ |
|
response_b, |
|
conversation_state, |
|
timeout_popup, |
|
model_b_input, |
|
model_b_send, |
|
], |
|
) |
|
|
|
def submit_feedback(vote, models_state, conversation_state): |
|
|
|
match vote: |
|
case "Model A": |
|
winner_model = "left" |
|
case "Model B": |
|
winner_model = "right" |
|
case "Tie": |
|
winner_model = "tie" |
|
case _: |
|
winner_model = "both_bad" |
|
|
|
|
|
vote_entry = { |
|
"left": models_state["left"], |
|
"right": models_state["right"], |
|
"winner": winner_model, |
|
} |
|
|
|
|
|
now = datetime.now() |
|
quarter = (now.month - 1) // 3 + 1 |
|
folder_name = f"{now.year}_Q{quarter}" |
|
file_name = now.strftime("%Y%m%d_%H%M%S") |
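
            # e.g. folder "2025_Q2", file "20250403_142233" (illustrative).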
|
|
|
|
|
save_content_to_hf(vote_entry, "SE-Arena/votes", folder_name, file_name) |
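
            # Strip the (potentially large) repo context from each first message

            # below, so only the user's original inquiry is archived.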
|
|
|
conversation_state["right_chat"][0]["content"] = conversation_state[ |
|
"right_chat" |
|
][0]["content"].split("\n\nInquiry: ")[-1] |
|
conversation_state["left_chat"][0]["content"] = conversation_state[ |
|
"left_chat" |
|
][0]["content"].split("\n\nInquiry: ")[-1] |
|
|
|
|
|
save_content_to_hf( |
|
conversation_state, "SE-Arena/conversations", folder_name, file_name |
|
) |
|
|
|
|
|
models_state.clear() |
|
conversation_state.clear() |
|
|
|
|
|
return ( |
|
gr.update( |
|
value="", interactive=True, visible=True |
|
), |
|
gr.update( |
|
value="", interactive=True, visible=True |
|
), |
|
gr.update( |
|
value="", visible=False |
|
), |
|
gr.update( |
|
value="", visible=False |
|
), |
|
gr.update( |
|
value="", visible=False |
|
), |
|
gr.update(value=""), |
|
gr.update(value=""), |
|
gr.update(visible=False), |
|
gr.update(visible=False), |
|
gr.update( |
|
value="Submit", interactive=True, visible=True |
|
), |
|
gr.update( |
|
value="Tie", interactive=True |
|
), |
|
get_leaderboard_data(vote_entry), |
|
gr.update( |
|
visible=True |
|
), |
|
) |
|
|
|
|
|
submit_feedback_btn.click( |
|
submit_feedback, |
|
inputs=[feedback, models_state, conversation_state], |
|
outputs=[ |
|
shared_input, |
|
repo_url, |
|
user_prompt_md, |
|
response_a_title, |
|
response_b_title, |
|
response_a, |
|
response_b, |
|
multi_round_inputs, |
|
vote_panel, |
|
send_first, |
|
feedback, |
|
leaderboard_component, |
|
thanks_message, |
|
], |
|
) |
|
|
|
|
|
terms_of_service = gr.Markdown( |
|
""" |
|
## Terms of Service |
|
|
|
Users are required to agree to the following terms before using the service: |
|
|
|
- The service is a **research preview**. It only provides limited safety measures and may generate offensive content. |
|
- It must not be used for any **illegal, harmful, violent, racist, or sexual** purposes. |
|
- Please do not upload any **private** information. |
|
        - The service collects user dialogue data and reserves the right to distribute it under a **Creative Commons Attribution (CC-BY)** or a similar license.
|
""" |
|
) |
|
|
|
app.launch() |
|
|