updated app according to edit suggestions
- app.py: +89 -131
- tiered_models_data.csv: +21 -21
app.py
CHANGED
@@ -37,13 +37,11 @@ st.markdown(
 }
 
 .container {
-    max-width: 1000px;
-    margin: 0 auto;
+    max-width: 1000px;
+    margin: 0 auto;
     padding: 20px;
 }
 
-
-
 table {
     width: 100%;
     border-collapse: collapse;
@@ -81,86 +79,31 @@ st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>', unsafe_allow_html=True)
 st.markdown('</div>', unsafe_allow_html=True)
 
 # Load the data
-# data_path = "factbench_data.csv"
 data_path = "tiered_models_data.csv"
 df = pd.read_csv(data_path)
 
+# Assign ranks within each tier based on factuality_score
+df['rank'] = df.groupby('tier')['factuality_score'].rank(
+    ascending=False, method='min').astype(int)
+
+# Replace NaN values with '-'
+df.fillna('-', inplace=True)
+
+df['original_order'] = df.groupby('tier').cumcount()
+
 # Create tabs
 tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])
 
 # Tab 1: Leaderboard
-# with tab1:
-#     st.markdown('<div class="title">Leaderboard</div>',
-#                 unsafe_allow_html=True)
-#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
-
-#     # Dropdown menu to filter tiers
-#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
-#     selected_tier = st.selectbox('Select Tier:', tiers)
-
-#     # Filter the data based on the selected tier
-#     if selected_tier != 'All Tiers':
-#         filtered_df = df[df['Tier'] == selected_tier]
-#     else:
-#         filtered_df = df
-
-#     # Create HTML for the table
-#     html = '''
-#     <table>
-#         <thead>
-#             <tr>
-#                 <th>Tier</th>
-#                 <th>Model</th>
-#                 <th>FactScore</th>
-#                 <th>SAFE</th>
-#                 <th>Factcheck-GPT</th>
-#                 <th>VERIFY</th>
-#             </tr>
-#         </thead>
-#         <tbody>
-#     '''
-
-#     # Generate the rows of the table
-#     current_tier = None
-#     for i, row in filtered_df.iterrows():
-#         if row['Tier'] != current_tier:
-#             if current_tier is not None:
-#                 # Close the previous tier row
-#                 html += '    </tr>'
-#             current_tier = row['Tier']
-#             html += f'    <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
-#         else:
-#             html += '    <tr>'
-
-#         # Fill in model and scores
-#         html += f'''
-#             <td>{row['Model']}</td>
-#             <td>{row['FactScore']:.2f}</td>
-#             <td>{row['SAFE']:.2f}</td>
-#             <td>{row['Factcheck-GPT']:.2f}</td>
-#             <td>{row['VERIFY']:.2f}</td>
-#         </tr>
-#         '''
-
-#     # Close the last row and table tags
-#     html += '''
-#     </table>
-#     '''
-
-#     # Display the table
-#     st.markdown(html, unsafe_allow_html=True)
-
-#     st.markdown('</div>', unsafe_allow_html=True)
-df['rank'] = df['factuality_score'].rank(
-    ascending=False, method='min').astype(int)
-
 with tab1:
+    # df['original_order'] = df.groupby('tier').cumcount()
+    # print(df['original_order'])
     st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
 
     # Dropdown menu to filter tiers
-    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
+    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
     selected_tier = st.selectbox('Select Tier:', tiers)
 
     # Filter the data based on the selected tier
@@ -168,84 +111,98 @@ with tab1:
         filtered_df = df[df['tier'] == selected_tier]
     else:
         filtered_df = df
-
-    # sort_order = st.radio('Sort by Factuality Score:',
-    #                       ('Ascending', 'Descending'))
-
-    # # Sort the dataframe based on Factuality Score
-    # if sort_order == 'Ascending':
-    #     filtered_df = filtered_df.sort_values(
-    #         by='factuality_score', ascending=True)
-    # else:
-    #     filtered_df = filtered_df.sort_values(
-    #         by='factuality_score', ascending=False)
-    # Option to sort by Factuality Score in ascending order
+
     sort_by_factuality = st.checkbox('Sort by Factuality Score')
 
     # Sort the dataframe based on Factuality Score if the checkbox is selected
     if sort_by_factuality:
         updated_filtered_df = filtered_df.sort_values(
-            by='factuality_score', ascending=False)
+            by=['tier', 'factuality_score'], ascending=[True, False]
+        )
     else:
-        updated_filtered_df = filtered_df
+        updated_filtered_df = filtered_df.sort_values(
+            by=['tier', 'original_order']
+        )
 
     # Create HTML for the table
-    html = '''
-    <table>
-        <thead>
-            <tr>
-                <th>Rank</th>
-                <th>Tier</th>
-                <th>Model</th>
-                <th>Factuality Score</th>
-                <th>Hallucination Score</th>
-                <th># Tokens</th>
-                <th># Factual</th>
-                <th># Undecidable</th>
-                <th># Unsupported</th>
-                <th>Factual Recall</th>
-                <th>Conceptual Understanding</th>
-                <th>Procedural Execution</th>
-                <th>Comparative Analysis</th>
-                <th>Recommendations and Insights</th>
-                <th>Domain-Specific Knowledge</th>
-                <th>Temporal Context</th>
-            </tr>
-        </thead>
-        <tbody>
-    '''
+    if selected_tier == 'All Tiers':
+        html = '''
+        <table>
+            <thead>
+                <tr>
+                    <th>Tier</th>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Factuality Score</th>
+                    <th>Factual Recall</th>
+                    <th>Conceptual Understanding</th>
+                    <th>Procedural Execution</th>
+                    <th>Comparative Analysis</th>
+                    <th>Recommendations and Insights</th>
+                    <th>Domain-Specific Knowledge</th>
+                    <th>Temporal Context</th>
+                    <th>Hallucination Score</th>
+                    <th># Tokens</th>
+                    <th># Factual</th>
+                    <th># Undecidable</th>
+                    <th># Unsupported</th>
+                </tr>
+            </thead>
+            <tbody>
+        '''
+    else:
+        html = '''
+        <table>
+            <thead>
+                <tr>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Factuality Score</th>
+                    <th>Factual Recall</th>
+                    <th>Conceptual Understanding</th>
+                    <th>Procedural Execution</th>
+                    <th>Comparative Analysis</th>
+                    <th>Recommendations and Insights</th>
+                    <th>Domain-Specific Knowledge</th>
+                    <th>Temporal Context</th>
+                    <th>Hallucination Score</th>
+                    <th># Tokens</th>
+                    <th># Factual</th>
+                    <th># Undecidable</th>
+                    <th># Unsupported</th>
+                </tr>
+            </thead>
+            <tbody>
+        '''
 
     # Generate the rows of the table
     current_tier = None
     for i, row in updated_filtered_df.iterrows():
-
-        # if row['tier'] != current_tier:
-        #     if current_tier is not None:
-        #         # Close the previous tier row
-        #         html += '    </tr>'
-        #     current_tier = row['tier']
-        #     html += f'    <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
-        # else:
-        #     html += '    <tr>'
-        html += '    <tr>'
+        html += '<tr>'
+
+        # Only display the 'Tier' column if 'All Tiers' is selected
+        if selected_tier == 'All Tiers':
+            if row['tier'] != current_tier:
+                current_tier = row['tier']
+                html += f'<td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
+
         # Fill in model and scores
         html += f'''
             <td>{row['rank']}</td>
-            <td>{row['tier']}</td>
             <td>{row['model']}</td>
-            <td>{row['factuality_score']:.2f}</td>
-            <td>{row['hallucination_score']:.2f}</td>
-            <td>{row['avg_tokens']:.2f}</td>
-            <td>{row['avg_factual_units']:.2f}</td>
+            <td>{row['factuality_score']}</td>
+            <td>{row['prompt_categories.Factual Recall']}</td>
+            <td>{row['prompt_categories.Conceptual Understanding']}</td>
+            <td>{row['prompt_categories.Procedural Execution']}</td>
+            <td>{row['prompt_categories.Comparative Analysis']}</td>
+            <td>{row['prompt_categories.Recommendations and Insights']}</td>
+            <td>{row['prompt_categories.Domain-Specific Knowledge']}</td>
+            <td>{row['prompt_categories.Temporal Context']}</td>
+            <td>{row['hallucination_score']}</td>
+            <td>{row['avg_tokens']}</td>
+            <td>{row['avg_factual_units']}</td>
             <td>{row['avg_undecidable_units']:.2f}</td>
             <td>{row['avg_unsupported_units']:.2f}</td>
-            <td>{row['prompt_categories.Factual Recall']:.2f}</td>
-            <td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
-            <td>{row['prompt_categories.Procedural Execution']:.2f}</td>
-            <td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
-            <td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
-            <td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
-            <td>{row['prompt_categories.Temporal Context']:.2f}</td>
         </tr>
         '''
 
@@ -258,6 +215,7 @@ with tab1:
     st.markdown(html, unsafe_allow_html=True)
 
     st.markdown('</div>', unsafe_allow_html=True)
+
 # Tab 2: Details
 with tab2:
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
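The substantive change in app.py is the ranking logic: the old global `df['factuality_score'].rank(...)` becomes a per-tier rank, and a `groupby('tier').cumcount()` column records each row's CSV position so the unsorted view stays stable. A minimal sketch of that pandas pattern on a made-up two-tier frame (the column names mirror the app's, the values are invented):

```python
import pandas as pd

# Toy stand-in for tiered_models_data.csv; scores are invented.
df = pd.DataFrame({
    "tier": ["Tier 1: Hard", "Tier 1: Hard",
             "Tier 2: Moderate", "Tier 2: Moderate"],
    "model": ["A", "B", "C", "D"],
    "factuality_score": [75.7, 73.8, 80.7, 78.0],
})

# Rank within each tier, best score first; method='min' gives ties equal rank.
df["rank"] = (df.groupby("tier")["factuality_score"]
                .rank(ascending=False, method="min")
                .astype(int))

# Remember each row's position within its tier for the default (unsorted) view.
df["original_order"] = df.groupby("tier").cumcount()

# Checkbox on: tiers in order, best model first within each tier.
print(df.sort_values(by=["tier", "factuality_score"], ascending=[True, False]))

# Checkbox off: tiers in order, rows in their original CSV order.
print(df.sort_values(by=["tier", "original_order"]))
```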
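The other technique worth noting is how the 'All Tiers' table groups rows: the loop emits a tier cell only on the first row of each tier and relies on `rowspan="7"` matching the seven models per tier, so the cell visually spans the whole group. A reduced sketch of the same trick with hypothetical two-row groups (the group size drives the rowspan, just as the hard-coded 7 does in the app):

```python
# Group table rows under one spanning first-column cell (hypothetical data).
rows = [
    ("Tier 1: Hard", "GPT4-o"),
    ("Tier 1: Hard", "Claude-3.5-Sonnet"),
    ("Tier 2: Moderate", "GPT4-o"),
    ("Tier 2: Moderate", "Claude-3.5-Sonnet"),
]
MODELS_PER_TIER = 2  # plays the role of the app's hard-coded rowspan="7"

html = "<table><tbody>"
current_tier = None
for tier, model in rows:
    html += "<tr>"
    if tier != current_tier:
        current_tier = tier
        # Emit the tier cell once per group; it spans the group's rows.
        html += (f'<td rowspan="{MODELS_PER_TIER}" '
                 f'style="vertical-align: middle;">{tier}</td>')
    html += f"<td>{model}</td></tr>"
html += "</tbody></table>"
print(html)
```

If the number of models per tier ever changes, the rowspan has to change with it, or the tier cells drift out of alignment.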
tiered_models_data.csv
CHANGED
@@ -1,23 +1,23 @@
 tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
-Tier 1:
-Tier 1:
-Tier 1:
-Tier 1:
-Tier 1:
-Tier 1:
-Tier 1:
-Tier 2: Moderate,
-Tier 2: Moderate,
-Tier 2: Moderate,
-Tier 2: Moderate,
-Tier 2: Moderate,
-Tier 2: Moderate,
-Tier 2: Moderate,
-Tier 3:
-Tier 3:
-Tier 3:
-Tier 3:
-Tier 3:
-Tier 3:
-Tier 3:
+Tier 1: Hard,GPT4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
+Tier 1: Hard,Gemini1.5-Pro,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
+Tier 1: Hard,Llama3.1-70B-Instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
+Tier 1: Hard,Llama3.1-405B-Instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
+Tier 1: Hard,Claude-3.5-Sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
+Tier 1: Hard,CommandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
+Tier 1: Hard,Mistral-Large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
+Tier 2: Moderate,GPT4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
+Tier 2: Moderate,Gemini1.5-Pro,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
+Tier 2: Moderate,Llama3.1-70B-Instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
+Tier 2: Moderate,Llama3.1-405B-Instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
+Tier 2: Moderate,Claude-3.5-Sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
+Tier 2: Moderate,CommandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
+Tier 2: Moderate,Mistral-Large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
+Tier 3: Easy,GPT4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
+Tier 3: Easy,Gemini1.5-Pro,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
+Tier 3: Easy,Llama3.1-70B-Instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
+Tier 3: Easy,Llama3.1-405B-Instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
+Tier 3: Easy,Claude-3.5-Sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
+Tier 3: Easy,CommandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
+Tier 3: Easy,Mistral-Large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0
 
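Three of the Tier 3 rows (GPT4-o, Gemini1.5-Pro, Llama3.1-70B-Instruct) carry 14 fields against the 15-column header, so pandas pads the missing prompt_categories.Temporal Context value with NaN; the `df.fillna('-')` added in app.py is what turns those cells into a dash. A small check of that behaviour, assuming the file above is saved as tiered_models_data.csv:

```python
import pandas as pd

df = pd.read_csv("tiered_models_data.csv")

# Short rows are padded with NaN in the trailing column.
print(df["prompt_categories.Temporal Context"].isna().sum())  # expect 3

# Mirror app.py: NaN becomes '-' before rendering. The column is then
# mixed-type, so numeric formatting like f"{v:.2f}" would raise on '-'.
df.fillna("-", inplace=True)
print(df.loc[df["tier"] == "Tier 3: Easy",
             ["model", "prompt_categories.Temporal Context"]])
```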