remove an obsolete model
Browse files
- app.py +121 -54
- context_window.json +0 -1
app.py
CHANGED
@@ -403,19 +403,20 @@ def get_leaderboard_data(feedback_entry=None):
|
|
403 |
pagerank_result = evalica.pagerank(
|
404 |
feedback_df["left"], feedback_df["right"], feedback_df["winner"]
|
405 |
)
|
406 |
-
|
407 |
# Calculate consistency score as a pandas Series aligned with other metrics
|
408 |
-
is_result = pd.Series(
|
|
|
|
|
409 |
|
410 |
# Loop through models and update values
|
411 |
for model in is_result.index:
|
412 |
# Filter self-matches for this model
|
413 |
self_matches = feedback_df[
|
414 |
-
(feedback_df["left"] == model) &
|
415 |
-
(feedback_df["right"] == model)
|
416 |
]
|
417 |
totals = len(self_matches)
|
418 |
-
|
419 |
if totals:
|
420 |
# Count non-draw outcomes (wins or losses)
|
421 |
draws = self_matches[self_matches["winner"] == evalica.Winner.Draw].shape[0]
|
@@ -681,23 +682,36 @@ with gr.Blocks() as app:
|
|
681 |
# Here we default to fail open, but you can change as needed.
|
682 |
return True
|
683 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
684 |
# Function to update model titles and responses
|
685 |
def update_model_titles_and_responses(
|
686 |
repo_url, user_input, models_state, conversation_state
|
687 |
):
|
688 |
# Guardrail check first
|
689 |
if not repo_url and not guardrail_check_se_relevance(user_input):
|
690 |
-
# Return updates to show the guardrail message and
|
691 |
return (
|
692 |
# [0] guardrail_message: Show guardrail message
|
693 |
gr.update(
|
694 |
value="### Oops! Try asking something about software engineering. Thanks!",
|
695 |
visible=True,
|
696 |
),
|
697 |
-
# [1] shared_input: clear and
|
698 |
-
gr.update(value="", visible=True),
|
699 |
-
# [2] repo_url: clear and
|
700 |
-
gr.update(value="", visible=True),
|
701 |
# [3] user_prompt_md: clear and hide
|
702 |
gr.update(value="", visible=False),
|
703 |
# [4] response_a_title: clear and hide
|
@@ -712,8 +726,8 @@ with gr.Blocks() as app:
|
|
712 |
gr.update(visible=False),
|
713 |
# [9] vote_panel: hide
|
714 |
gr.update(visible=False),
|
715 |
-
# [10] send_first:
|
716 |
-
gr.update(visible=True, interactive=True),
|
717 |
# [11] feedback: enable the selection
|
718 |
gr.update(interactive=True),
|
719 |
# [12] models_state: pass state as-is
|
@@ -760,10 +774,10 @@ with gr.Blocks() as app:
|
|
760 |
return (
|
761 |
# [0] guardrail_message: hide
|
762 |
gr.update(visible=False),
|
763 |
-
# [1] shared_input:
|
764 |
-
gr.update(value="", interactive=
|
765 |
-
# [2] repo_url:
|
766 |
-
gr.update(value="", interactive=
|
767 |
# [3] user_prompt_md: hide
|
768 |
gr.update(value="", visible=False),
|
769 |
# [4] response_a_title: hide
|
@@ -778,8 +792,8 @@ with gr.Blocks() as app:
|
|
778 |
gr.update(visible=False),
|
779 |
# [9] vote_panel: hide
|
780 |
gr.update(visible=False),
|
781 |
-
# [10] send_first:
|
782 |
-
gr.update(visible=True, interactive=
|
783 |
# [11] feedback: disable
|
784 |
gr.update(interactive=False),
|
785 |
# [12] models_state: pass state as-is
|
@@ -806,10 +820,10 @@ with gr.Blocks() as app:
|
|
806 |
return (
|
807 |
# [0] guardrail_message: hide (since no guardrail issue)
|
808 |
gr.update(visible=False),
|
809 |
-
# [1] shared_input:
|
810 |
-
gr.update(visible=False),
|
811 |
-
# [2] repo_url:
|
812 |
-
gr.update(visible=False),
|
813 |
# [3] user_prompt_md: display the user's query
|
814 |
gr.update(value=f"**Your Query:**\n\n{user_input}", visible=True),
|
815 |
# [4] response_a_title: show title for Model A
|
@@ -824,8 +838,8 @@ with gr.Blocks() as app:
|
|
824 |
gr.update(visible=True),
|
825 |
# [9] vote_panel: show vote panel
|
826 |
gr.update(visible=True),
|
827 |
-
# [10] send_first: hide the submit button
|
828 |
-
gr.update(visible=False),
|
829 |
# [11] feedback: enable the feedback selection
|
830 |
gr.update(interactive=True),
|
831 |
# [12] models_state: pass updated models_state
|
@@ -915,9 +929,20 @@ with gr.Blocks() as app:
|
|
915 |
|
916 |
# First round handling
|
917 |
send_first.click(
|
918 |
-
fn=hide_thanks_message,
|
|
|
|
|
919 |
).then(
|
920 |
-
fn=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
921 |
inputs=[repo_url, shared_input, models_state, conversation_state],
|
922 |
outputs=[
|
923 |
guardrail_message,
|
@@ -941,6 +966,15 @@ with gr.Blocks() as app:
|
|
941 |
],
|
942 |
)
|
943 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
944 |
# Handle subsequent rounds
|
945 |
def handle_model_a_send(user_input, models_state, conversation_state):
|
946 |
try:
|
@@ -952,10 +986,8 @@ with gr.Blocks() as app:
|
|
952 |
response,
|
953 |
conversation_state,
|
954 |
gr.update(visible=False),
|
955 |
-
gr.update(
|
956 |
-
|
957 |
-
), # Clear and enable model_a_input
|
958 |
-
gr.update(interactive=False), # Disable model_a_send button
|
959 |
)
|
960 |
except TimeoutError as e:
|
961 |
# Disable inputs when timeout occurs
|
@@ -963,12 +995,19 @@ with gr.Blocks() as app:
|
|
963 |
gr.update(value=""), # Clear response
|
964 |
conversation_state,
|
965 |
gr.update(visible=True), # Show the timeout popup
|
966 |
-
gr.update(interactive=
|
967 |
-
gr.update(interactive=
|
968 |
)
|
969 |
except Exception as e:
|
970 |
raise gr.Error(str(e))
|
971 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
972 |
def handle_model_b_send(user_input, models_state, conversation_state):
|
973 |
try:
|
974 |
response = chat_with_models(
|
@@ -979,10 +1018,8 @@ with gr.Blocks() as app:
|
|
979 |
response,
|
980 |
conversation_state,
|
981 |
gr.update(visible=False),
|
982 |
-
gr.update(
|
983 |
-
|
984 |
-
), # Clear and enable model_b_input
|
985 |
-
gr.update(interactive=False), # Disable model_b_send button
|
986 |
)
|
987 |
except TimeoutError as e:
|
988 |
# Disable inputs when timeout occurs
|
@@ -990,14 +1027,21 @@ with gr.Blocks() as app:
|
|
990 |
gr.update(value=""), # Clear response
|
991 |
conversation_state,
|
992 |
gr.update(visible=True), # Show the timeout popup
|
993 |
-
gr.update(interactive=
|
994 |
-
gr.update(interactive=
|
995 |
)
|
996 |
except Exception as e:
|
997 |
raise gr.Error(str(e))
|
998 |
|
999 |
model_a_send.click(
|
1000 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1001 |
inputs=[model_a_input, models_state, conversation_state],
|
1002 |
outputs=[
|
1003 |
response_a,
|
@@ -1008,7 +1052,14 @@ with gr.Blocks() as app:
|
|
1008 |
],
|
1009 |
)
|
1010 |
model_b_send.click(
|
1011 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1012 |
inputs=[model_b_input, models_state, conversation_state],
|
1013 |
outputs=[
|
1014 |
response_b,
|
@@ -1050,19 +1101,35 @@ with gr.Blocks() as app:
|
|
1050 |
|
1051 |
# Adjust output count to match the interface definition
|
1052 |
return (
|
1053 |
-
gr.update(
|
1054 |
-
|
1055 |
-
|
1056 |
-
gr.update(
|
1057 |
-
|
1058 |
-
|
1059 |
-
gr.update(
|
1060 |
-
|
1061 |
-
|
1062 |
-
gr.update(
|
1063 |
-
|
1064 |
-
|
1065 |
-
gr.update(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1066 |
)
|
1067 |
|
1068 |
# Update the click event for the submit feedback button
|
|
|
403 |
pagerank_result = evalica.pagerank(
|
404 |
feedback_df["left"], feedback_df["right"], feedback_df["winner"]
|
405 |
)
|
406 |
+
|
407 |
# Calculate consistency score as a pandas Series aligned with other metrics
|
408 |
+
is_result = pd.Series(
|
409 |
+
"N/A", index=elo_result.scores.index
|
410 |
+
) # Initialize every model with "N/A", using the same index as the other metrics
|
411 |
|
412 |
# Loop through models and update values
|
413 |
for model in is_result.index:
|
414 |
# Filter self-matches for this model
|
415 |
self_matches = feedback_df[
|
416 |
+
(feedback_df["left"] == model) & (feedback_df["right"] == model)
|
|
|
417 |
]
|
418 |
totals = len(self_matches)
|
419 |
+
|
420 |
if totals:
|
421 |
# Count non-draw outcomes (wins or losses)
|
422 |
draws = self_matches[self_matches["winner"] == evalica.Winner.Draw].shape[0]
|
|
|
682 |
# Here we default to fail open, but you can change as needed.
|
683 |
return True
|
684 |
|
685 |
+
def disable_first_submit_ui():
|
686 |
+
"""First function to immediately disable UI elements"""
|
687 |
+
return (
|
688 |
+
# [0] guardrail_message: hide
|
689 |
+
gr.update(visible=False),
|
690 |
+
# [1] shared_input: disable but keep visible
|
691 |
+
gr.update(interactive=False),
|
692 |
+
# [2] repo_url: disable but keep visible
|
693 |
+
gr.update(interactive=False),
|
694 |
+
# [3] send_first: disable and show loading state
|
695 |
+
gr.update(interactive=False, value="Processing..."),
|
696 |
+
)
|
697 |
+
|
698 |
# Function to update model titles and responses
|
699 |
def update_model_titles_and_responses(
|
700 |
repo_url, user_input, models_state, conversation_state
|
701 |
):
|
702 |
# Guardrail check first
|
703 |
if not repo_url and not guardrail_check_se_relevance(user_input):
|
704 |
+
# Return updates to show the guardrail message and re-enable UI
|
705 |
return (
|
706 |
# [0] guardrail_message: Show guardrail message
|
707 |
gr.update(
|
708 |
value="### Oops! Try asking something about software engineering. Thanks!",
|
709 |
visible=True,
|
710 |
),
|
711 |
+
# [1] shared_input: clear and re-enable
|
712 |
+
gr.update(value="", interactive=True, visible=True),
|
713 |
+
# [2] repo_url: clear and re-enable
|
714 |
+
gr.update(value="", interactive=True, visible=True),
|
715 |
# [3] user_prompt_md: clear and hide
|
716 |
gr.update(value="", visible=False),
|
717 |
# [4] response_a_title: clear and hide
|
|
|
726 |
gr.update(visible=False),
|
727 |
# [9] vote_panel: hide
|
728 |
gr.update(visible=False),
|
729 |
+
# [10] send_first: re-enable button with original text
|
730 |
+
gr.update(visible=True, interactive=True, value="Submit"),
|
731 |
# [11] feedback: enable the selection
|
732 |
gr.update(interactive=True),
|
733 |
# [12] models_state: pass state as-is
|
|
|
774 |
return (
|
775 |
# [0] guardrail_message: hide
|
776 |
gr.update(visible=False),
|
777 |
+
# [1] shared_input: re-enable and clear
|
778 |
+
gr.update(value="", interactive=True, visible=True),
|
779 |
+
# [2] repo_url: re-enable and clear
|
780 |
+
gr.update(value="", interactive=True, visible=True),
|
781 |
# [3] user_prompt_md: hide
|
782 |
gr.update(value="", visible=False),
|
783 |
# [4] response_a_title: hide
|
|
|
792 |
gr.update(visible=False),
|
793 |
# [9] vote_panel: hide
|
794 |
gr.update(visible=False),
|
795 |
+
# [10] send_first: re-enable with original text
|
796 |
+
gr.update(visible=True, interactive=True, value="Submit"),
|
797 |
# [11] feedback: disable
|
798 |
gr.update(interactive=False),
|
799 |
# [12] models_state: pass state as-is
|
|
|
820 |
return (
|
821 |
# [0] guardrail_message: hide (since no guardrail issue)
|
822 |
gr.update(visible=False),
|
823 |
+
# [1] shared_input: re-enable but hide
|
824 |
+
gr.update(interactive=True, visible=False),
|
825 |
+
# [2] repo_url: re-enable but hide
|
826 |
+
gr.update(interactive=True, visible=False),
|
827 |
# [3] user_prompt_md: display the user's query
|
828 |
gr.update(value=f"**Your Query:**\n\n{user_input}", visible=True),
|
829 |
# [4] response_a_title: show title for Model A
|
|
|
838 |
gr.update(visible=True),
|
839 |
# [9] vote_panel: show vote panel
|
840 |
gr.update(visible=True),
|
841 |
+
# [10] send_first: hide the submit button but restore label
|
842 |
+
gr.update(visible=False, value="Submit"),
|
843 |
# [11] feedback: enable the feedback selection
|
844 |
gr.update(interactive=True),
|
845 |
# [12] models_state: pass updated models_state
|
|
|
929 |
|
930 |
# First round handling
|
931 |
send_first.click(
|
932 |
+
fn=hide_thanks_message,
|
933 |
+
inputs=[],
|
934 |
+
outputs=[thanks_message]
|
935 |
).then(
|
936 |
+
fn=disable_first_submit_ui, # First disable UI
|
937 |
+
inputs=[],
|
938 |
+
outputs=[
|
939 |
+
guardrail_message,
|
940 |
+
shared_input,
|
941 |
+
repo_url,
|
942 |
+
send_first # Just the essential UI elements to update immediately
|
943 |
+
]
|
944 |
+
).then(
|
945 |
+
fn=update_model_titles_and_responses, # Then do the actual processing
|
946 |
inputs=[repo_url, shared_input, models_state, conversation_state],
|
947 |
outputs=[
|
948 |
guardrail_message,
|
|
|
966 |
],
|
967 |
)
|
968 |
|
969 |
+
def disable_model_a_ui():
|
970 |
+
"""First function to immediately disable model A UI elements"""
|
971 |
+
return (
|
972 |
+
# [0] model_a_input: disable
|
973 |
+
gr.update(interactive=False),
|
974 |
+
# [1] model_a_send: disable and show loading state
|
975 |
+
gr.update(interactive=False, value="Processing...")
|
976 |
+
)
|
977 |
+
|
978 |
# Handle subsequent rounds
|
979 |
def handle_model_a_send(user_input, models_state, conversation_state):
|
980 |
try:
|
|
|
986 |
response,
|
987 |
conversation_state,
|
988 |
gr.update(visible=False),
|
989 |
+
gr.update(value="", interactive=True), # Clear and enable model_a_input
|
990 |
+
gr.update(interactive=False, value="Send to Model A"), # Reset button text
|
|
|
|
|
991 |
)
|
992 |
except TimeoutError as e:
|
993 |
# On timeout, show the timeout popup and re-enable the inputs
|
|
|
995 |
gr.update(value=""), # Clear response
|
996 |
conversation_state,
|
997 |
gr.update(visible=True), # Show the timeout popup
|
998 |
+
gr.update(interactive=True), # Re-enable model_a_input
|
999 |
+
gr.update(interactive=True, value="Send to Model A"), # Re-enable model_a_send button
|
1000 |
)
|
1001 |
except Exception as e:
|
1002 |
raise gr.Error(str(e))
|
1003 |
+
def disable_model_b_ui():
|
1004 |
+
"""First function to immediately disable model B UI elements"""
|
1005 |
+
return (
|
1006 |
+
# [0] model_b_input: disable
|
1007 |
+
gr.update(interactive=False),
|
1008 |
+
# [1] model_b_send: disable and show loading state
|
1009 |
+
gr.update(interactive=False, value="Processing...")
|
1010 |
+
)
|
1011 |
def handle_model_b_send(user_input, models_state, conversation_state):
|
1012 |
try:
|
1013 |
response = chat_with_models(
|
|
|
1018 |
response,
|
1019 |
conversation_state,
|
1020 |
gr.update(visible=False),
|
1021 |
+
gr.update(value="", interactive=True), # Clear and enable model_b_input
|
1022 |
+
gr.update(interactive=False, value="Send to Model B"), # Reset button text
|
|
|
|
|
1023 |
)
|
1024 |
except TimeoutError as e:
|
1025 |
# On timeout, show the timeout popup and re-enable the inputs
|
|
|
1027 |
gr.update(value=""), # Clear response
|
1028 |
conversation_state,
|
1029 |
gr.update(visible=True), # Show the timeout popup
|
1030 |
+
gr.update(interactive=True), # Re-enable model_b_input
|
1031 |
+
gr.update(interactive=True, value="Send to Model B"), # Re-enable model_b_send button
|
1032 |
)
|
1033 |
except Exception as e:
|
1034 |
raise gr.Error(str(e))
|
1035 |
|
1036 |
model_a_send.click(
|
1037 |
+
fn=disable_model_a_ui, # First disable UI
|
1038 |
+
inputs=[],
|
1039 |
+
outputs=[
|
1040 |
+
model_a_input,
|
1041 |
+
model_a_send
|
1042 |
+
]
|
1043 |
+
).then(
|
1044 |
+
fn=handle_model_a_send, # Then do the actual processing
|
1045 |
inputs=[model_a_input, models_state, conversation_state],
|
1046 |
outputs=[
|
1047 |
response_a,
|
|
|
1052 |
],
|
1053 |
)
|
1054 |
model_b_send.click(
|
1055 |
+
fn=disable_model_b_ui, # First disable UI
|
1056 |
+
inputs=[],
|
1057 |
+
outputs=[
|
1058 |
+
model_b_input,
|
1059 |
+
model_b_send
|
1060 |
+
]
|
1061 |
+
).then(
|
1062 |
+
fn=handle_model_b_send, # Then do the actual processing
|
1063 |
inputs=[model_b_input, models_state, conversation_state],
|
1064 |
outputs=[
|
1065 |
response_b,
|
|
|
1101 |
|
1102 |
# Adjust output count to match the interface definition
|
1103 |
return (
|
1104 |
+
gr.update(
|
1105 |
+
value="", interactive=True, visible=True
|
1106 |
+
), # [0] Clear shared_input textbox
|
1107 |
+
gr.update(
|
1108 |
+
value="", interactive=True, visible=True
|
1109 |
+
), # [1] Clear repo_url textbox
|
1110 |
+
gr.update(
|
1111 |
+
value="", visible=False
|
1112 |
+
), # [2] Hide user_prompt_md markdown component
|
1113 |
+
gr.update(
|
1114 |
+
value="", visible=False
|
1115 |
+
), # [3] Hide response_a_title markdown component
|
1116 |
+
gr.update(
|
1117 |
+
value="", visible=False
|
1118 |
+
), # [4] Hide response_b_title markdown component
|
1119 |
+
gr.update(value=""), # [5] Clear Model A response markdown component
|
1120 |
+
gr.update(value=""), # [6] Clear Model B response markdown component
|
1121 |
+
gr.update(visible=False), # [7] Hide multi_round_inputs row
|
1122 |
+
gr.update(visible=False), # [8] Hide vote_panel row
|
1123 |
+
gr.update(
|
1124 |
+
value="Submit", interactive=True, visible=True
|
1125 |
+
), # [9] Reset send_first button
|
1126 |
+
gr.update(
|
1127 |
+
value="Can't Decide", interactive=True
|
1128 |
+
), # [10] Reset feedback radio selection
|
1129 |
+
get_leaderboard_data(feedback_entry), # [11] Updated leaderboard data
|
1130 |
+
gr.update(
|
1131 |
+
visible=True
|
1132 |
+
), # [12] Show the thanks_message markdown component
|
1133 |
)
|
1134 |
|
1135 |
# Update the click event for the submit feedback button
|
context_window.json
CHANGED
@@ -22,7 +22,6 @@
|
|
22 |
"grok-3-beta": 1000000,
|
23 |
"grok-3-mini-fast-beta": 1000000,
|
24 |
"grok-3-mini-beta": 1000000,
|
25 |
-
"llama-3.1-8b": 128000,
|
26 |
"llama-3.1-405b": 128000,
|
27 |
"llama-3.3-70b": 128000,
|
28 |
"llama4-scout-instruct-basic": 10000000,
|
|
|
22 |
"grok-3-beta": 1000000,
|
23 |
"grok-3-mini-fast-beta": 1000000,
|
24 |
"grok-3-mini-beta": 1000000,
|
|
|
25 |
"llama-3.1-405b": 128000,
|
26 |
"llama-3.3-70b": 128000,
|
27 |
"llama4-scout-instruct-basic": 10000000,
|