justinxzhao
committed on
Commit
•
a056e0b
1
Parent(s):
312e7a9
Initial version of data tab browser.
Browse files- .gitignore +2 -1
- app.py +247 -2
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
env/
|
|
|
|
1 |
+
env/
|
2 |
+
.DS_Store
|
app.py
CHANGED
@@ -1,4 +1,249 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
|
4 |
+
# Define constants for the five pairwise judging outcomes.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Ratings that favor A / favor B, regardless of margin.
_A_WINS = {MAJOR_A_WIN, MINOR_A_WIN}
_B_WINS = {MAJOR_B_WIN, MINOR_B_WIN}


def is_consistent(rating, reverse_rating):
    """Return True if a rating and its position-swapped counterpart agree.

    ``rating`` judges the pair (A, B) and ``reverse_rating`` judges the same
    pair with positions swapped, so a consistent judge flips direction: an
    A-favoring outcome in one order must be B-favoring in the other, and a
    tie must remain a tie.

    Args:
        rating: Pairwise choice for the original ordering (e.g. "A>>B").
        reverse_rating: Pairwise choice for the swapped ordering.

    Returns:
        True when the two ratings are mutually consistent; False otherwise,
        including for unrecognized rating strings.
    """
    # The original implementation had four directional branches, but branches
    # 3-4 were exact mirrors of 1-2 (same membership tests with the operands
    # of `and` swapped), so two checks cover every directional case.
    if rating in _A_WINS and reverse_rating in _B_WINS:
        return True
    if rating in _B_WINS and reverse_rating in _A_WINS:
        return True
    # A tie is only consistent with a tie; everything else (tie vs. win,
    # same-direction wins, unknown strings) is inconsistent.
    return rating == TIE and reverse_rating == TIE
|
40 |
+
|
41 |
+
|
42 |
+
# Load the evaluation artifacts (JSONL, one record per line).
# NOTE(review): paths are relative to the app's working directory — the app
# must be launched from the repo root for these reads to succeed.
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)

# Build human-readable scenario selector labels: "<emobench_id>: <scenario>".
# The id prefix is parsed back out later with split(": "), so the separator
# here and there must stay in sync.
df_test_set["scenario_option"] = (
    df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
)
scenario_options = df_test_set["scenario_option"].tolist()

# Distinct responder models present in the responses data.
model_options = df_responses["llm_responder"].unique().tolist()

# Distinct judge models present in the judging data.
judge_options = df_response_judging["llm_judge"].unique().tolist()
|
58 |
+
|
59 |
+
# Page-level Streamlit configuration; must run before any other st.* call.
st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

# Three equal-width columns for the navigation-style buttons.
col1, col2, col3 = st.columns(3)

# Injected CSS so each button stretches to fill its column.
full_width_button_css = """
<style>
div.stButton > button {
    width: 100%;
}
</style>
"""

st.markdown(full_width_button_css, unsafe_allow_html=True)

# One button per column; clicking shows its placeholder message.
_nav_items = (
    ("Blog", "Button 1 clicked"),
    ("Paper", "Button 2 clicked"),
    ("Github", "Button 3 clicked"),
)
for _col, (_label, _message) in zip((col1, col2, col3), _nav_items):
    with _col:
        if st.button(_label):
            st.write(_message)
|
87 |
+
|
88 |
+
# Injected CSS that centers every heading level (title, subheader, etc.).
center_styles = """
<style>
h1, h2, h3, h4, h5, h6 {
    text-align: center;
}
</style>
"""
st.markdown(center_styles, unsafe_allow_html=True)

# Page heading.
st.title("Language Model Council")
st.subheader("Applied to emotional intelligence")
|
101 |
+
|
102 |
+
# Horizontal tab bar for the three main views. `tabs` is indexed by the
# tab-content sections that follow.
tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

# Tab 0: leaderboard view.
with tabs[0]:
    st.write("This is the leaderboard results page.")
    # Placeholder rows until real leaderboard results are wired in.
    placeholder_scores = {"Name": ["Alice", "Bob", "Charlie"], "Score": [95, 85, 75]}
    st.table(placeholder_scores)
|
111 |
+
|
112 |
+
# Tab 1: browse one scenario, two model responses side by side, and the
# pairwise judgments (in both orderings) from a chosen judge.
with tabs[1]:
    # --- Scenario picker --------------------------------------------------
    selected_scenario = st.selectbox("Select Scenario", scenario_options)

    if selected_scenario:
        # Labels are "<emobench_id>: <scenario>", so the id precedes ": ".
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        # Show the full dilemma text plus its metadata.
        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")

    st.divider()

    # --- Side-by-side model responses ------------------------------------
    col1, col2 = st.columns(2)

    with col1:
        # Left column is pinned to a fixed reference model.
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox("Select Model", [fixed_model], key="fixed_model")

        if selected_scenario:
            # Bug fix: filter first and check emptiness before .iloc[0];
            # the original called .iloc[0] unconditionally, which raises
            # IndexError when no response row matches.
            fixed_rows = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == fixed_model)
            ]
            if fixed_rows.empty:
                st.write("No response found for the selected combination.")
            else:
                st.write(fixed_rows.iloc[0]["response_string"])

    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )

        if selected_model and selected_scenario:
            dynamic_rows = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == selected_model)
            ]
            if dynamic_rows.empty:
                st.write("No response found for the selected combination.")
            else:
                st.write(dynamic_rows.iloc[0]["response_string"])

    st.divider()

    # --- Pairwise judging (both orderings) --------------------------------
    selected_judge = st.selectbox("Select Judge", judge_options)

    if selected_judge and selected_scenario:
        col1, col2 = st.columns(2)

        # NOTE(review): these filters do not constrain emobench_id, which
        # implicitly assumes one judging row per (judge, ordered pair) —
        # confirm against the judging data's schema.
        left_rows = df_response_judging[
            (df_response_judging["llm_judge"] == selected_judge)
            & (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]
        right_rows = df_response_judging[
            (df_response_judging["llm_judge"] == selected_judge)
            & (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]

        # Bug fix: the original took .iloc[0] of both selections before any
        # check, so a missing combination crashed with IndexError and its
        # later `if not ….empty:` guards could never be False (a Series
        # returned by .iloc[0] is never empty).
        if not left_rows.empty and not right_rows.empty:
            judging_details_left = left_rows.iloc[0]
            judging_details_right = right_rows.iloc[0]

            # Flag whether the judge flipped its preference when the
            # response order was swapped, as a consistent judge must.
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

            with col1:
                st.write(f"**{fixed_model}** vs **{selected_model}**")
                st.write(
                    f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                )
                st.code(judging_details_left["judging_response_string"])

            with col2:
                st.write(f"**{selected_model}** vs **{fixed_model}**")
                st.write(
                    f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                )
                st.code(judging_details_right["judging_response_string"])
        else:
            st.write("No judging details found for the selected combination.")

    st.divider()

    # --- Pairwise-choice distributions over all judges --------------------
    col1, col2 = st.columns(2)

    with col1:
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_left)

    with col2:
        pairwise_counts_right = df_response_judging[
            (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_right)
|
235 |
+
|
236 |
+
# Tab 2: static "About Us" content.
with tabs[2]:
    st.write("This is the about us page.")
    # Placeholder copy; st.write renders the string as Markdown.
    st.write(
        """
        **Our Mission:**
        To provide the best service and data insights.

        **Our Team:**
        - Alice
        - Bob
        - Charlie
        """
    )
|