shezamunir committed
Commit d73147f · verified · 1 Parent(s): 45ea86a

updated app according to edit suggestions

Files changed (2)
  1. app.py +89 -131
  2. tiered_models_data.csv +21 -21
app.py CHANGED
@@ -37,13 +37,11 @@ st.markdown(
     }

     .container {
-        max-width: 1000px; /* Set a max-width for the container */
-        margin: 0 auto; /* Center the container */
+        max-width: 1000px;
+        margin: 0 auto;
         padding: 20px;
     }

-
-
     table {
         width: 100%;
         border-collapse: collapse;
@@ -81,86 +79,31 @@ st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</di
 st.markdown('</div>', unsafe_allow_html=True)

 # Load the data
-# data_path = "factbench_data.csv"
 data_path = "tiered_models_data.csv"
 df = pd.read_csv(data_path)

+# Assign ranks within each tier based on factuality_score
+df['rank'] = df.groupby('tier')['factuality_score'].rank(
+    ascending=False, method='min').astype(int)
+
+# Replace NaN values with '-'
+df.fillna('-', inplace=True)
+
+df['original_order'] = df.groupby('tier').cumcount()
+
 # Create tabs
 tab1, tab2, tab3 = st.tabs(
     ["Leaderboard", "Benchmark Details", "Submit your models"])

 # Tab 1: Leaderboard
-# with tab1:
-#     st.markdown('<div class="title">Leaderboard</div>',
-#                 unsafe_allow_html=True)
-#     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
-
-#     # Dropdown menu to filter tiers
-#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
-#     selected_tier = st.selectbox('Select Tier:', tiers)
-
-#     # Filter the data based on the selected tier
-#     if selected_tier != 'All Tiers':
-#         filtered_df = df[df['Tier'] == selected_tier]
-#     else:
-#         filtered_df = df
-
-#     # Create HTML for the table
-#     html = '''
-#     <table>
-#         <thead>
-#             <tr>
-#                 <th>Tier</th>
-#                 <th>Model</th>
-#                 <th>FactScore</th>
-#                 <th>SAFE</th>
-#                 <th>Factcheck-GPT</th>
-#                 <th>VERIFY</th>
-#             </tr>
-#         </thead>
-#         <tbody>
-#     '''
-
-#     # Generate the rows of the table
-#     current_tier = None
-#     for i, row in filtered_df.iterrows():
-#         if row['Tier'] != current_tier:
-#             if current_tier is not None:
-#                 # Close the previous tier row
-#                 html += ' </tr>'
-#             current_tier = row['Tier']
-#             html += f' <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
-#         else:
-#             html += ' <tr>'
-
-#         # Fill in model and scores
-#         html += f'''
-#             <td>{row['Model']}</td>
-#             <td>{row['FactScore']:.2f}</td>
-#             <td>{row['SAFE']:.2f}</td>
-#             <td>{row['Factcheck-GPT']:.2f}</td>
-#             <td>{row['VERIFY']:.2f}</td>
-#         </tr>
-#         '''
-
-#     # Close the last row and table tags
-#     html += '''
-#     </table>
-#     '''
-
-#     # Display the table
-#     st.markdown(html, unsafe_allow_html=True)
-
-#     st.markdown('</div>', unsafe_allow_html=True)
-df['rank'] = df['factuality_score'].rank(
-    ascending=False, method='min').astype(int)
-
 with tab1:
+    # df['original_order'] = df.groupby('tier').cumcount()
+    # print(df['original_order'])
     st.markdown('<div class="title">Leaderboard</div>', unsafe_allow_html=True)
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)

     # Dropdown menu to filter tiers
-    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
+    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
     selected_tier = st.selectbox('Select Tier:', tiers)

     # Filter the data based on the selected tier
@@ -168,84 +111,98 @@ with tab1:
         filtered_df = df[df['tier'] == selected_tier]
     else:
         filtered_df = df
-    # Add sorting functionality for Factuality Score
-    # sort_order = st.radio('Sort by Factuality Score:',
-    #                       ('Ascending', 'Descending'))
-
-    # # Sort the dataframe based on Factuality Score
-    # if sort_order == 'Ascending':
-    #     filtered_df = filtered_df.sort_values(
-    #         by='factuality_score', ascending=True)
-    # else:
-    #     filtered_df = filtered_df.sort_values(
-    #         by='factuality_score', ascending=False)
-    # Option to sort by Factuality Score in ascending order
+
     sort_by_factuality = st.checkbox('Sort by Factuality Score')

     # Sort the dataframe based on Factuality Score if the checkbox is selected
     if sort_by_factuality:
         updated_filtered_df = filtered_df.sort_values(
-            by='factuality_score', ascending=False)
+            by=['tier', 'factuality_score'], ascending=[True, False]
+        )
     else:
-        updated_filtered_df = filtered_df
+        updated_filtered_df = filtered_df.sort_values(
+            by=['tier', 'original_order']
+        )

     # Create HTML for the table
-    html = '''
-    <table>
-        <thead>
-            <tr>
-                <th>Rank</th>
-                <th>Tier</th>
-                <th>Model</th>
-                <th>Factuality Score</th>
-                <th>Hallucination Score</th>
-                <th>Avg Tokens</th>
-                <th>Avg Factual Units</th>
-                <th>Avg Undecidable Units</th>
-                <th>Avg Unsupported Units</th>
-                <th>Factual Recall</th>
-                <th>Conceptual Understanding</th>
-                <th>Procedural Execution</th>
-                <th>Comparative Analysis</th>
-                <th>Recommendations and Insights</th>
-                <th>Domain-Specific Knowledge</th>
-                <th>Temporal Context</th>
-            </tr>
-        </thead>
-        <tbody>
-    '''
+    if selected_tier == 'All Tiers':
+        html = '''
+        <table>
+            <thead>
+                <tr>
+                    <th>Tier</th>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Factuality Score</th>
+                    <th>Factual Recall</th>
+                    <th>Conceptual Understanding</th>
+                    <th>Procedural Execution</th>
+                    <th>Comparative Analysis</th>
+                    <th>Recommendations and Insights</th>
+                    <th>Domain-Specific Knowledge</th>
+                    <th>Temporal Context</th>
+                    <th>Hallucination Score</th>
+                    <th># Tokens</th>
+                    <th># Factual</th>
+                    <th># Undecidable</th>
+                    <th># Unsupported</th>
+                </tr>
+            </thead>
+            <tbody>
+        '''
+    else:
+        html = '''
+        <table>
+            <thead>
+                <tr>
+                    <th>Rank</th>
+                    <th>Model</th>
+                    <th>Factuality Score</th>
+                    <th>Factual Recall</th>
+                    <th>Conceptual Understanding</th>
+                    <th>Procedural Execution</th>
+                    <th>Comparative Analysis</th>
+                    <th>Recommendations and Insights</th>
+                    <th>Domain-Specific Knowledge</th>
+                    <th>Temporal Context</th>
+                    <th>Hallucination Score</th>
+                    <th># Tokens</th>
+                    <th># Factual</th>
+                    <th># Undecidable</th>
+                    <th># Unsupported</th>
+                </tr>
+            </thead>
+            <tbody>
+        '''

     # Generate the rows of the table
     current_tier = None
     for i, row in updated_filtered_df.iterrows():
-        # if row['tier'] != current_tier:
-        #     if current_tier is not None:
-        #         html += ' </tr>'
-        #     current_tier = row['tier']
-        #     # 7 models, change this number when more models
-        #     html += f' <tr><td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
-        # else:
-        #     html += ' <tr>'
-
-        html += ' <tr>'
+        html += '<tr>'
+
+        # Only display the 'Tier' column if 'All Tiers' is selected
+        if selected_tier == 'All Tiers':
+            if row['tier'] != current_tier:
+                current_tier = row['tier']
+                html += f'<td rowspan="7" style="vertical-align: middle;">{current_tier}</td>'
+
         # Fill in model and scores
         html += f'''
             <td>{row['rank']}</td>
-            <td>{row['tier']}</td>
             <td>{row['model']}</td>
-            <td>{row['factuality_score']:.2f}</td>
-            <td>{row['hallucination_score']:.2f}</td>
-            <td>{row['avg_tokens']:.2f}</td>
-            <td>{row['avg_factual_units']:.2f}</td>
+            <td>{row['factuality_score']}</td>
+            <td>{row['prompt_categories.Factual Recall']}</td>
+            <td>{row['prompt_categories.Conceptual Understanding']}</td>
+            <td>{row['prompt_categories.Procedural Execution']}</td>
+            <td>{row['prompt_categories.Comparative Analysis']}</td>
+            <td>{row['prompt_categories.Recommendations and Insights']}</td>
+            <td>{row['prompt_categories.Domain-Specific Knowledge']}</td>
+            <td>{row['prompt_categories.Temporal Context']}</td>
+            <td>{row['hallucination_score']}</td>
+            <td>{row['avg_tokens']}</td>
+            <td>{row['avg_factual_units']}</td>
             <td>{row['avg_undecidable_units']:.2f}</td>
             <td>{row['avg_unsupported_units']:.2f}</td>
-            <td>{row['prompt_categories.Factual Recall']:.2f}</td>
-            <td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
-            <td>{row['prompt_categories.Procedural Execution']:.2f}</td>
-            <td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
-            <td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
-            <td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
-            <td>{row['prompt_categories.Temporal Context']:.2f}</td>
         </tr>
         '''

@@ -258,6 +215,7 @@ with tab1:
     st.markdown(html, unsafe_allow_html=True)

     st.markdown('</div>', unsafe_allow_html=True)
+
 # Tab 2: Details
 with tab2:
     st.markdown('<div class="tab-content">', unsafe_allow_html=True)
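The data-preparation change above is the heart of the commit: ranks are now computed per tier, the original CSV row order is remembered, and the sort checkbox only reorders rows within each tier. Below is a minimal standalone sketch of that logic, using a made-up two-tier frame in place of tiered_models_data.csv (the `toy` frame and its values are illustrative only, not from the repo):

```python
import pandas as pd

# Made-up stand-in for tiered_models_data.csv, just to show the shape of the logic.
toy = pd.DataFrame({
    'tier': ['Tier 1: Hard', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 2: Moderate'],
    'model': ['model-a', 'model-b', 'model-a', 'model-b'],
    'factuality_score': [75.2, 73.8, 80.7, 79.9],
})

# Rank within each tier, best factuality_score first; ties share the lower rank.
toy['rank'] = toy.groupby('tier')['factuality_score'].rank(
    ascending=False, method='min').astype(int)

# Remember the CSV row order inside each tier so the unsorted view stays stable.
toy['original_order'] = toy.groupby('tier').cumcount()

# Mirror the app's NaN handling: missing category scores become '-' (relevant for
# the Tier 3 rows with no Temporal Context value); note this turns those columns
# into strings. The toy frame has no NaNs, so this is a no-op here.
toy.fillna('-', inplace=True)

# What the checkbox toggles between: score order within tiers vs. original order.
sorted_view = toy.sort_values(by=['tier', 'factuality_score'], ascending=[True, False])
default_view = toy.sort_values(by=['tier', 'original_order'])

print(sorted_view[['tier', 'model', 'rank', 'factuality_score']])
```

The switch from `:.2f` formatting to plain interpolation in the row template lines up with this: once a column can contain the '-' placeholder, applying `:.2f` to it would fail.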
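The rewritten rendering loop still builds the HTML by hand, emitting the Tier cell only once per group of rows via rowspan, and only when 'All Tiers' is selected. Outside Streamlit the pattern reduces to the sketch below (the row data and the `models_per_tier` name are invented for illustration; the app hard-codes rowspan="7" because each tier currently has seven models):

```python
# Rowspan pattern used by the new table-building loop (illustrative data only).
rows = [
    {'tier': 'Tier 1: Hard', 'rank': 1, 'model': 'model-a'},
    {'tier': 'Tier 1: Hard', 'rank': 2, 'model': 'model-b'},
    {'tier': 'Tier 2: Moderate', 'rank': 1, 'model': 'model-a'},
    {'tier': 'Tier 2: Moderate', 'rank': 2, 'model': 'model-b'},
]
models_per_tier = 2  # must match the rows per tier, like the hard-coded 7 in app.py

html = '<table><tbody>'
current_tier = None
for row in rows:
    html += '<tr>'
    # Emit the tier cell only on the first row of each tier; it spans the whole group.
    if row['tier'] != current_tier:
        current_tier = row['tier']
        html += f'<td rowspan="{models_per_tier}">{current_tier}</td>'
    html += f"<td>{row['rank']}</td><td>{row['model']}</td></tr>"
html += '</tbody></table>'
print(html)
```

As the deleted comment in the old code already warned, the hard-coded rowspan must be updated whenever the number of models per tier changes, and the pattern assumes the rows arrive grouped by tier, which both sort branches guarantee by sorting on 'tier' first.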
tiered_models_data.csv CHANGED
@@ -1,23 +1,23 @@
 tier,model,factuality_score,hallucination_score,avg_tokens,avg_factual_units,avg_undecidable_units,avg_unsupported_units,prompt_categories.Factual Recall,prompt_categories.Conceptual Understanding,prompt_categories.Procedural Execution,prompt_categories.Comparative Analysis,prompt_categories.Recommendations and Insights,prompt_categories.Domain-Specific Knowledge,prompt_categories.Temporal Context
-Tier 1: Easy,gpt4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
-Tier 1: Easy,gemini,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
-Tier 1: Easy,llama3.1_70B_instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
-Tier 1: Easy,llama3.1_405B_instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
-Tier 1: Easy,claude-3.5-sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
-Tier 1: Easy,commandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
-Tier 1: Easy,mistral-large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
-Tier 2: Moderate,gpt4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
-Tier 2: Moderate,gemini,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
-Tier 2: Moderate,llama3.1_70B_instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
-Tier 2: Moderate,llama3.1_405B_instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
-Tier 2: Moderate,claude-3.5-sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
-Tier 2: Moderate,commandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
-Tier 2: Moderate,mistral-large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
-Tier 3: Hard,gpt4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
-Tier 3: Hard,gemini,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
-Tier 3: Hard,llama3.1_70B_instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
-Tier 3: Hard,llama3.1_405B_instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
-Tier 3: Hard,claude-3.5-sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
-Tier 3: Hard,commandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
-Tier 3: Hard,mistral-large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0
+Tier 1: Hard,GPT4-o,75.69,0.64,561.72,23.91,4.61,1.01,76.49,78.49,66.14,76.13,76.3,75.91,69.52
+Tier 1: Hard,Gemini1.5-Pro,73.81,0.68,516.41,22.23,4.47,1.12,73.35,79.39,66.7,72.44,73.64,74.31,71.42
+Tier 1: Hard,Llama3.1-70B-Instruct,70.01,0.89,531.35,27.09,5.67,2.13,68.99,75.38,64.73,70.34,70.03,70.64,56.61
+Tier 1: Hard,Llama3.1-405B-Instruct,68.64,0.93,550.74,26.6,6.15,2.19,66.07,74.67,65.88,70.18,68.29,70.91,49.97
+Tier 1: Hard,Claude-3.5-Sonnet,74.95,0.65,395.77,22.64,4.03,1.19,74.84,77.74,69.55,74.87,75.3,76.4,64.19
+Tier 1: Hard,CommandR+,73.15,0.71,440.93,23.55,4.51,1.4,69.41,80.24,68.98,74.36,73.53,73.02,66.43
+Tier 1: Hard,Mistral-Large-2,75.19,0.67,485.58,23.21,4.09,1.36,75.87,78.32,63.98,77.17,75.5,76.38,65.8
+Tier 2: Moderate,GPT4-o,80.72,0.5,624.67,24.42,3.59,0.89,80.06,84.33,72.83,79.75,81.5,81.1,70.02
+Tier 2: Moderate,Gemini1.5-Pro,78.02,0.57,565.97,22.16,3.71,0.97,74.13,81.74,73.13,77.32,78.37,80.04,68.03
+Tier 2: Moderate,Llama3.1-70B-Instruct,75.76,0.71,607.44,25.35,4.33,1.76,63.87,77.92,72.94,78.67,79.56,76.83,47.71
+Tier 2: Moderate,Llama3.1-405B-Instruct,75.05,0.7,599.3,25.24,4.74,1.41,67.96,78.09,68.51,76.16,77.31,76.25,65.43
+Tier 2: Moderate,Claude-3.5-Sonnet,79.92,0.54,414.32,22.15,3.32,1.09,75.88,83.52,77.39,79.31,81.06,78.81,72.47
+Tier 2: Moderate,CommandR+,80.71,0.52,483.32,24.1,3.17,1.09,73.49,85.46,75.6,82.97,82.12,81.61,58.49
+Tier 2: Moderate,Mistral-Large-2,79.97,0.52,528.44,22.65,3.21,1.02,77.21,81.23,75.2,81.24,80.86,82.03,63.63
+Tier 3: Easy,GPT4-o,91.63,0.26,640.84,29.29,2.01,0.53,94.31,93.62,82.98,89.19,91.86,94.12
+Tier 3: Easy,Gemini1.5-Pro,89.86,0.31,551.81,25.6,1.88,0.71,92.61,90.34,83.32,87.39,90.93,95.23
+Tier 3: Easy,Llama3.1-70B-Instruct,89.3,0.33,607.75,31.38,2.08,0.83,75.5,91.75,83.61,87.11,93.03,93.08
+Tier 3: Easy,Llama3.1-405B-Instruct,86.57,0.4,599.87,30.12,2.88,0.85,79.58,88.92,75.23,85.11,89.2,90.21,100.0
+Tier 3: Easy,Claude-3.5-Sonnet,89.61,0.3,411.2,26.72,1.49,0.81,89.85,92.45,75.13,86.48,91.46,91.97,100.0
+Tier 3: Easy,CommandR+,91.65,0.25,499.06,27.95,1.57,0.54,87.71,91.8,87.16,89.79,94.12,93.85,100.0
+Tier 3: Easy,Mistral-Large-2,92.0,0.25,523.57,27.8,1.8,0.55,92.96,92.33,90.58,89.41,92.81,92.41,100.0
 
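Since tiered_models_data.csv now carries the renamed tiers and display-style model names, a quick reviewer-side check such as the following (not part of the app) confirms the file still parses the way app.py expects, including the three Tier 3 rows that have no Temporal Context value and therefore rely on the new fillna('-') call:

```python
import pandas as pd

df = pd.read_csv("tiered_models_data.csv")

# Expect 7 models in each of the three renamed tiers (21 rows in total).
print(df['tier'].value_counts())

# Reproduce the app's per-tier ranking; rank 1 should be each tier's best score.
df['rank'] = df.groupby('tier')['factuality_score'].rank(
    ascending=False, method='min').astype(int)
print(df.loc[df['rank'] == 1, ['tier', 'model', 'factuality_score']])

# Rows missing a Temporal Context score; the app renders these cells as '-'.
missing = df[df['prompt_categories.Temporal Context'].isna()]
print(missing[['tier', 'model']])
```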