zhiminy committed
Commit 264fef1 · 1 Parent(s): 7ad0805

Update app.py

Files changed (1):
  1. app.py +102 -92
app.py CHANGED
@@ -39,10 +39,6 @@ with open("context_window.json", "r") as file:
 # Get list of available models
 available_models = list(context_window.keys())
 
-# Initialize global variables
-models_state = {}
-conversation_state = {}
-
 
 def fetch_github_content(url):
     """Fetch detailed content from a GitHub URL using PyGithub."""
@@ -188,68 +184,62 @@ def fetch_url_content(url):
 
 
 # Truncate prompt
-def truncate_prompt(user_input, model_alias, models, conversation_state):
+def truncate_prompt(model_alias, models, conversation_state):
     """
     Truncate the conversation history and user input to fit within the model's context window.
 
     Args:
-        user_input (str): The latest input from the user.
-        model_alias (str): Alias for the model being used (e.g., "Model A", "Model B").
+        model_alias (str): Alias for the model being used (i.e., "left", "right").
         models (dict): Dictionary mapping model aliases to their names.
         conversation_state (dict): State containing the conversation history for all models.
 
     Returns:
         str: Truncated conversation history and user input.
     """
-    model_name = models[model_alias]
-    context_length = context_window.get(model_name, 4096)
-
     # Get the full conversation history for the model
-    history = conversation_state.get(model_name, [])
-    full_conversation = [
-        {"role": msg["role"], "content": msg["content"]} for msg in history
-    ]
-    full_conversation.append({"role": "user", "content": user_input})
-
-    # Convert to JSON string for accurate length measurement
-    json_conversation = json.dumps(full_conversation)
+    full_conversation = conversation_state[f"{model_alias}_chat"]
 
-    if len(json_conversation) <= context_length:
-        # If the full conversation fits, return it as-is
-        return full_conversation
+    # Get the context length for the model
+    context_length = context_window[models[model_alias]]
 
-    # Truncate based on the current round
-    if not history:  # First round, truncate FILO
-        while len(json.dumps(full_conversation)) > context_length:
-            full_conversation.pop(0)  # Remove from the start
-    else:  # Subsequent rounds, truncate FIFO
-        while len(json.dumps(full_conversation)) > context_length:
-            full_conversation.pop(-1)  # Remove from the end
+    # Single loop to handle both FIFO removal and content truncation
+    while len(json.dumps(full_conversation)) > context_length:
+        # If we have more than one message, remove the oldest (FIFO)
+        if len(full_conversation) > 1:
+            full_conversation.pop(0)
+        # If only one message remains, truncate its content
+        else:
+            current_length = len(json.dumps(full_conversation))
+            # Calculate how many characters we need to remove
+            excess = current_length - context_length
+            # Add a buffer to ensure we remove enough (accounting for JSON encoding)
+            truncation_size = min(excess + 10, len(full_conversation[0]["content"]))
+
+            if truncation_size <= 0:
+                break  # Can't truncate further
+
+            # Truncate the content from the end to fit
+            full_conversation[0]["content"] = full_conversation[0]["content"][
+                :-truncation_size
+            ]
 
     return full_conversation
 
 
-def chat_with_models(
-    user_input, model_alias, models, conversation_state, timeout=TIMEOUT
-):
-    model_name = models[model_alias]
-    truncated_input = truncate_prompt(
-        user_input, model_alias, models, conversation_state
-    )
-    conversation_state.setdefault(model_name, []).append(
-        {"role": "user", "content": user_input}
-    )
-
+def chat_with_models(model_alias, models, conversation_state, timeout=TIMEOUT):
+    truncated_input = truncate_prompt(model_alias, models, conversation_state)
     response_event = threading.Event()  # Event to signal response completion
     model_response = {"content": None, "error": None}
 
     def request_model_response():
         try:
-            request_params = {"model": model_name, "messages": truncated_input}
+            request_params = {"model": models[model_alias], "messages": truncated_input}
             response = openai_client.chat.completions.create(**request_params)
             model_response["content"] = response.choices[0].message.content
         except Exception as e:
-            model_response["error"] = f"{model_name} model is not available. Error: {e}"
+            model_response["error"] = (
+                f"{models[model_alias]} model is not available. Error: {e}"
+            )
         finally:
            response_event.set()  # Signal that the response is completed
 
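The rewritten `truncate_prompt` replaces the old two-branch logic with a single loop: history is trimmed oldest-first (FIFO), and once only one message remains its content is sliced from the end until the JSON-encoded payload fits. Below is a minimal, self-contained sketch of that loop with toy values standing in for the app's `context_window`, `models`, and `conversation_state` globals. Note that the length check counts characters of the JSON encoding, not tokens, so it is only a proxy for the model's real context limit.

```python
import json

# Toy stand-ins for the app's globals (illustrative values only)
context_window = {"model-x": 120}  # pretend the window is 120 JSON characters
models = {"left": "model-x"}
conversation_state = {
    "left_chat": [
        {"role": "user", "content": "First question about my repository"},
        {"role": "assistant", "content": "A fairly long first answer from the model"},
        {"role": "user", "content": "A follow-up question"},
    ]
}

full_conversation = conversation_state["left_chat"]
context_length = context_window[models["left"]]

# Same shape as the committed loop: FIFO removal, then tail truncation
while len(json.dumps(full_conversation)) > context_length:
    if len(full_conversation) > 1:
        full_conversation.pop(0)  # drop the oldest message first
    else:
        excess = len(json.dumps(full_conversation)) - context_length
        cut = min(excess + 10, len(full_conversation[0]["content"]))
        if cut <= 0:
            break  # nothing left to trim
        full_conversation[0]["content"] = full_conversation[0]["content"][:-cut]

print(json.dumps(full_conversation))  # fits within the 120-character budget
```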
@@ -267,37 +257,40 @@ def chat_with_models(
     elif model_response["error"]:
         raise Exception(model_response["error"])
     else:
+        # Get the full conversation history for the model
+        model_key = f"{model_alias}_chat"
+
         # Add the model's response to the conversation state
-        conversation_state[model_name].append(
+        conversation_state[model_key].append(
             {"role": "assistant", "content": model_response["content"]}
         )
-
+
         # Format the complete conversation history with different colors
-        formatted_history = format_conversation_history(conversation_state[model_name])
-
+        formatted_history = format_conversation_history(conversation_state[model_key][1:])
+
         return formatted_history
 
 
 def format_conversation_history(conversation_history):
     """
     Format the conversation history with different colors for user and model messages.
-
+
     Args:
         conversation_history (list): List of conversation messages with role and content.
-
+
     Returns:
         str: Markdown formatted conversation history.
     """
     formatted_text = ""
-
+
     for message in conversation_history:
         if message["role"] == "user":
             # Format user messages with blue text
             formatted_text += f"<div style='color: #0066cc; background-color: #f0f7ff; padding: 10px; border-radius: 5px; margin-bottom: 10px;'><strong>User:</strong> {message['content']}</div>\n\n"
-        else:  # assistant/model messages
-            # Format model messages with dark green text
+        else:
+            # Format assistant messages with dark green text
             formatted_text += f"<div style='color: #006633; background-color: #f0fff0; padding: 10px; border-radius: 5px; margin-bottom: 10px;'><strong>Model:</strong> {message['content']}</div>\n\n"
-
+
     return formatted_text
 
 
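Note the `[1:]` slice in the new rendering call: the first entry of each chat list is the combined context-plus-inquiry prompt, so the displayed transcript now starts with the model's first reply instead of repeating that long first message (the query itself is shown separately). A quick illustration of what the slice skips, with made-up contents:

```python
chat = [
    {"role": "user", "content": "Context: <repo info>\n\nInquiry: first question"},
    {"role": "assistant", "content": "first answer"},
    {"role": "user", "content": "a follow-up"},
]
for message in chat[1:]:  # rendering skips the long first prompt
    print(message["role"], "->", message["content"])
```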
@@ -315,10 +308,10 @@ def save_content_to_hf(feedback_data, repo_name):
     now = datetime.now()
     quarter = (now.month - 1) // 3 + 1
     year_quarter = f"{now.year}_Q{quarter}"
-    day_hour_minute_second = now.strftime("%d_%H%M%S")
+    timestamp = now.strftime("%Y%m%d_%H%M%S")
 
     # Define the path in the repository
-    filename = f"{year_quarter}/{day_hour_minute_second}.json"
+    filename = f"{year_quarter}/{timestamp}.json"
 
     # Ensure the user is authenticated with HF
     token = HfFolder.get_token()
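The timestamp format widens from `%d_%H%M%S` to `%Y%m%d_%H%M%S`, so filenames within a quarter are unambiguous across months and sort chronologically. For example:

```python
from datetime import datetime

now = datetime(2025, 3, 1, 12, 15, 30)  # illustrative fixed time
quarter = (now.month - 1) // 3 + 1
filename = f"{now.year}_Q{quarter}/{now.strftime('%Y%m%d_%H%M%S')}.json"
print(filename)  # 2025_Q1/20250301_121530.json
```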
@@ -365,11 +358,9 @@ def load_content_from_hf(repo_name="SE-Arena/votes"):
             )
             with open(local_path, "r") as f:
                 data = json.load(f)
-                if isinstance(data, list):
-                    feedback_data.extend(data)
-                elif isinstance(data, dict):
-                    feedback_data.append(data)
-
+                # Add the timestamp to the data
+                data["timestamp"] = file.split("/")[-1].split(".")[0]
+                feedback_data.append(data)
         return feedback_data
 
     except:
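The loader now assumes every vote file contains a single JSON object and recovers the vote's timestamp from the file path itself (which `save_content_to_hf` now encodes, per the previous hunk) instead of a stored field:

```python
file = "2025_Q1/20250301_121530.json"  # illustrative path from the dataset repo
timestamp = file.split("/")[-1].split(".")[0]
print(timestamp)  # 20250301_121530
```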
@@ -381,6 +372,10 @@ def get_leaderboard_data(feedback_entry=None):
     feedback_data = load_content_from_hf()
     feedback_df = pd.DataFrame(feedback_data)
 
+    # Load conversation data from the Hugging Face repository
+    conversation_data = load_content_from_hf("SE-Arena/conversations")
+    conversation_df = pd.DataFrame(conversation_data)
+
     # Concatenate the new feedback with the existing leaderboard data
     if feedback_entry is not None:
         feedback_df = pd.concat(
@@ -432,12 +427,12 @@ def get_leaderboard_data(feedback_entry=None):
     )
 
     # Calculate consistency score as a pandas Series aligned with other metrics
-    is_result = pd.Series(
+    cs_result = pd.Series(
         "N/A", index=elo_result.scores.index
     )  # Initialize with zeros using same index
 
     # Loop through models and update values
-    for model in is_result.index:
+    for model in cs_result.index:
         # Filter self-matches for this model
         self_matches = feedback_df[
             (feedback_df["left"] == model) & (feedback_df["right"] == model)
@@ -446,17 +441,19 @@ def get_leaderboard_data(feedback_entry=None):
 
         if totals:
             # Count non-draw outcomes (wins or losses)
-            draws = self_matches[self_matches["winner"] == evalica.Winner.Draw].shape[0]
-            # Store as percentage directly
-            is_result[model] = round(draws / totals * 100, 2)
+            cs_result[model] = round(
+                self_matches[self_matches["winner"] == evalica.Winner.Draw].shape[0]
+                / totals,
+                2,
+            )
 
     # Combine all results into a single DataFrame
     leaderboard_data = pd.DataFrame(
         {
             "Model": elo_result.scores.index,
             "Elo Score": elo_result.scores.values,
-            "Consistency Score": is_result.values,
-            "Average Win Rate": avr_result.scores.values * 100,
+            "Consistency Score": cs_result.values,
+            "Average Win Rate": avr_result.scores.values,
             "Bradley-Terry Coefficient": bt_result.scores.values,
             "Eigenvector Centrality Value": eigen_result.scores.values,
             "Newman Modularity Score": newman_result.scores.values,
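The series is renamed from `is_result` to `cs_result` (consistency score), and both it and `Average Win Rate` are now reported as 0 to 1 fractions rather than percentages. The score itself is the draw rate over self-matches, i.e. rounds where the same model was anonymously paired against itself. A toy pandas version, with a string standing in for `evalica.Winner.Draw`:

```python
import pandas as pd

DRAW = "draw"  # stand-in for evalica.Winner.Draw
feedback_df = pd.DataFrame(
    {
        "left":   ["m1", "m1", "m1", "m2"],
        "right":  ["m1", "m1", "m2", "m2"],
        "winner": [DRAW, "left", "left", DRAW],
    }
)

model = "m1"
self_matches = feedback_df[
    (feedback_df["left"] == model) & (feedback_df["right"] == model)
]
totals = self_matches.shape[0]
if totals:
    # Fraction of self-matches the voters judged a draw (ideal: 1.0)
    print(round(self_matches[self_matches["winner"] == DRAW].shape[0] / totals, 2))  # 0.5
```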
@@ -507,7 +504,7 @@ with gr.Blocks() as app:
     leaderboard_intro = gr.Markdown(
         """
 # 🏆 FM4SE Leaderboard: Community-Driven Evaluation of Top Foundation Models (FMs) in Software Engineering (SE) Tasks
-The SE Arena is an open-source platform designed to evaluate foundation models through human preference, fostering transparency and collaboration. Developed by researchers at [Software Analysis and Intelligence Lab (SAIL)](https://sail.cs.queensu.ca), the platform empowers the community to assess and compare the performance of leading FMs in SE tasks. For technical details, check out our [paper](https://arxiv.org/abs/2502.01860).
+The SE Arena is an open-source platform designed to evaluate foundation models through human preference, fostering transparency and collaboration. This platform aims to empower the SE community to assess and compare the performance of leading FMs in related tasks. For technical details, check out our [paper](https://arxiv.org/abs/2502.01860).
         """,
         elem_classes="leaderboard-intro",
     )
@@ -678,9 +675,9 @@ with gr.Blocks() as app:
         ],
     )
 
-    def guardrail_check_se_relevance(user_prompt):
+    def guardrail_check_se_relevance(user_input):
         """
-        Use gpt-4o-mini to check if the user_prompt is SE-related.
+        Use gpt-4o-mini to check if the user input is SE-related.
         Return True if it is SE-related, otherwise False.
         """
         # Example instructions for classification — adjust to your needs
@@ -692,7 +689,7 @@ with gr.Blocks() as app:
                 "Otherwise, respond with 'No'."
             ),
         }
-        user_message = {"role": "user", "content": user_prompt}
+        user_message = {"role": "user", "content": user_input}
 
         try:
             # Make the chat completion call
@@ -770,31 +767,37 @@ with gr.Blocks() as app:
             gr.update(visible=False),
         )
 
+        # Fetch repository info if a URL is provided
         repo_info = fetch_url_content(repo_url)
-        # Combine repository info (if available) with the user query.
         combined_user_input = (
-            f"Repo-related Information: {repo_info}\n\n{user_input}"
+            f"Context: {repo_info}\n\nInquiry: {user_input}"
             if repo_info
             else user_input
         )
 
         # Randomly select two models for the comparison
         selected_models = [random.choice(available_models) for _ in range(2)]
-        models = {"Model A": selected_models[0], "Model B": selected_models[1]}
+        models = {"left": selected_models[0], "right": selected_models[1]}
 
-        # Update the states
+        # Create a copy to avoid modifying the original
+        conversations = models.copy()
+        conversations.update({
+            "url": repo_url,
+            "left_chat": [{"role": "user", "content": combined_user_input}],
+            "right_chat": [{"role": "user", "content": combined_user_input}]
+        })
+
+        # Clear previous states
         models_state.clear()
-        models_state.update(models)
         conversation_state.clear()
-        conversation_state.update({name: [] for name in models.values()})
+
+        # Update the states
+        models_state.update(models)
+        conversation_state.update(conversations)
 
         try:
-            response_a = chat_with_models(
-                combined_user_input, "Model A", models_state, conversation_state
-            )
-            response_b = chat_with_models(
-                combined_user_input, "Model B", models_state, conversation_state
-            )
+            response_a = chat_with_models("left", models_state, conversation_state)
+            response_b = chat_with_models("right", models_state, conversation_state)
         except TimeoutError as e:
             # Handle timeout by resetting components and showing a popup.
             return (
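After this hunk, the models are tracked under anonymous `left`/`right` aliases and a single flat dict carries both chat histories plus the repo URL, matching the `f"{model_alias}_chat"` keys that `truncate_prompt` and `chat_with_models` read. Right after initialization, `conversation_state` looks roughly like this (model names and URL illustrative):

```python
conversation_state = {
    "left": "model-x",   # alias -> real model name, hidden from the voter
    "right": "model-y",
    "url": "https://github.com/org/repo",
    "left_chat": [{"role": "user", "content": "Context: ...\n\nInquiry: ..."}],
    "right_chat": [{"role": "user", "content": "Context: ...\n\nInquiry: ..."}],
}
```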
@@ -841,6 +844,9 @@ with gr.Blocks() as app:
         # Determine the initial state of the multi-round send buttons
         model_a_send_state = toggle_submit_button("")
         model_b_send_state = toggle_submit_button("")
+        display_content = f"### Your Query:\n\n{user_input}"
+        if repo_info:
+            display_content += f"\n\n### Repo-related URL:\n\n{repo_url}"
 
         # Return the updates for all 18 outputs.
         return (
@@ -851,7 +857,7 @@ with gr.Blocks() as app:
             # [2] repo_url: re-enable but hide
             gr.update(interactive=True, visible=False),
             # [3] user_prompt_md: display the user's query
-            gr.update(value=f"**Your Query:**\n\n{user_input}", visible=True),
+            gr.update(value=display_content, visible=True),
             # [4] response_a_title: show title for Model A
             gr.update(value="### Model A:", visible=True),
             # [5] response_b_title: show title for Model B
@@ -1002,9 +1008,8 @@ with gr.Blocks() as app:
     # Handle subsequent rounds
     def handle_model_a_send(user_input, models_state, conversation_state):
         try:
-            response = chat_with_models(
-                user_input, "Model A", models_state, conversation_state
-            )
+            conversation_state["left_chat"].append({"role": "user", "content": user_input})
+            response = chat_with_models("left", models_state, conversation_state)
             # Clear the input box and disable the send button
             return (
                 response,
@@ -1042,9 +1047,8 @@ with gr.Blocks() as app:
 
     def handle_model_b_send(user_input, models_state, conversation_state):
         try:
-            response = chat_with_models(
-                user_input, "Model B", models_state, conversation_state
-            )
+            conversation_state["right_chat"].append({"role": "user", "content": user_input})
+            response = chat_with_models("right", models_state, conversation_state)
             # Clear the input box and disable the send button
             return (
                 response,
@@ -1114,15 +1118,21 @@ with gr.Blocks() as app:
 
         # Create feedback entry
         feedback_entry = {
-            "left": models_state["Model A"],
-            "right": models_state["Model B"],
+            "left": models_state["left"],
+            "right": models_state["right"],
             "winner": winner_model,
-            "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
         }
 
         # Save feedback back to the Hugging Face dataset
         save_content_to_hf(feedback_entry, "SE-Arena/votes")
 
+        conversation_state["right_chat"][0]["content"] = conversation_state[
+            "right_chat"
+        ][0]["content"].split("\n\nInquiry: ")[-1]
+        conversation_state["left_chat"][0]["content"] = conversation_state[
+            "left_chat"
+        ][0]["content"].split("\n\nInquiry: ")[-1]
+
         # Save conversations back to the Hugging Face dataset
         save_content_to_hf(conversation_state, "SE-Arena/conversations")
 
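Before the conversations are archived, the potentially large `Context:` block is stripped from each chat's first turn; only the text after the last `\n\nInquiry: ` marker is kept. Inputs that never had a repo context pass through unchanged, since `split` then returns the whole string:

```python
combined = "Context: <repo README, issues, ...>\n\nInquiry: How do I fix this failing test?"
print(combined.split("\n\nInquiry: ")[-1])  # How do I fix this failing test?

plain = "How do I fix this failing test?"
print(plain.split("\n\nInquiry: ")[-1])  # unchanged
```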
@@ -1193,7 +1203,7 @@ with gr.Blocks() as app:
 
     - The service is a **research preview**. It only provides limited safety measures and may generate offensive content.
     - It must not be used for any **illegal, harmful, violent, racist, or sexual** purposes.
-    - Please do not upload any **private information**.
+    - Please do not upload any **private** information.
     - The service collects user dialogue data, including both text and images, and reserves the right to distribute it under a **Creative Commons Attribution (CC-BY)** or a similar license.
     """
     )
 