sheza munir committed
Commit 1b415b5 · verified · 1 Parent(s): aecfcd3

Updated app.py

Files changed (2):
  1. app.py +148 -105
  2. factbench_data.csv +13 -0
app.py CHANGED
@@ -1,19 +1,24 @@
 import streamlit as st
 import pandas as pd
+from PIL import Image
 
-# Set up page config for a better look
+# Set up page config
 st.set_page_config(
     page_title="FactBench Leaderboard",
-    layout="centered",
+    # layout="wide",  # commented out: content uses Streamlit's default centered layout
 )
 
+# Load the image
+image = Image.open("factEvalSteps.png")
+
+# Custom CSS for the page
 st.markdown(
     """
     <style>
     @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
 
     html, body, [class*="css"] {
-        font-family: 'Courier Prime', monospace;  /* Command-line font */
+        font-family: 'Courier Prime', monospace;
     }
 
     .title {
@@ -31,127 +36,165 @@ st.markdown(
         color: #555;
     }
 
-    .table-container {
-        margin-top: 20px;
+    .container {
+        max-width: 1000px; /* Set a max-width for the container */
+        margin: 0 auto; /* Center the container */
+        padding: 20px;
     }
 
+
+
     table {
-        width: 100%; /* Set table to fill width */
-        border-collapse: collapse; /* Merge cells neatly */
-        border-radius: 10px; /* Rounded edges */
-        overflow: hidden; /* Ensure rounded edges are visible */
+        width: 100%;
+        border-collapse: collapse;
+        border-radius: 10px;
+        overflow: hidden;
     }
 
     th, td {
-        padding: 8px; /* Reduced padding for smaller font */
-        text-align: center; /* Center-align text */
-        border: 1px solid #ddd; /* Add borders */
-        font-size: 14px; /* Smaller font size */
+        padding: 8px;
+        text-align: center;
+        border: 1px solid #ddd;
+        font-size: 14px;
+        transition: background-color 0.3s;
     }
 
     th {
-        background-color: #f2f2f2; /* Light gray background for header */
-        font-weight: bold; /* Bold font for headers */
-    }
-
-    /* Specific column widths */
-    td:nth-child(2), th:nth-child(2) { /* Wider Model column */
-        width: 30%; /* Increased width for model column */
-    }
-
-    td:nth-child(3), th:nth-child(3),
-    td:nth-child(4), th:nth-child(4),
-    td:nth-child(5), th:nth-child(5),
-    td:nth-child(6), th:nth-child(6) {
-        width: 17.5%; /* Equal width for the rest */
+        background-color: #f2f2f2;
+        font-weight: bold;
     }
 
-    /* Hover effect for table rows */
-    tr:hover {
-        background-color: #eaeaea; /* Light grey on hover */
+    td:hover {
+        background-color: #eaeaea;
     }
-
     </style>
     """,
     unsafe_allow_html=True
 )
 
-# Add title and description
-st.markdown('<div class="title">FactBench Leaderboard</div>',
+# Display title and description
+st.markdown('<div class="container">', unsafe_allow_html=True)
+st.markdown('<div class="title">FactBench</div>',
             unsafe_allow_html=True)
 st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
             unsafe_allow_html=True)
+st.markdown('</div>', unsafe_allow_html=True)
 
-# Data for all tiers combined
-data = {
-    'Tier': ['Easy', 'Easy', 'Easy', 'Easy',
-             'Moderate', 'Moderate', 'Moderate', 'Moderate',
-             'Hard', 'Hard', 'Hard', 'Hard'],
-    'Model': ['GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct',
-              'GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct',
-              'GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct'],
-    'FactScore': [53.19, 51.79, 52.49, 53.22, 54.76, 52.62, 52.53, 53.48, 69.44, 66.05, 69.85, 70.04],
-    'SAFE': [63.31, 61.24, 61.29, 61.63, 65.01, 62.68, 62.64, 63.29, 76.17, 75.69, 77.55, 77.01],
-    'Factcheck-GPT': [86.4, 83.45, 83.48, 83.57, 89.39, 87.44, 85.16, 86.37, 94.25, 91.09, 92.89, 93.64],
-    'VERIFY': [71.58, 69.38, 67.27, 64.94, 76.02, 74.24, 72.01, 70.25, 90.58, 87.82, 86.63, 85.79]
-}
-
-# Convert the data to a DataFrame
-df = pd.DataFrame(data)
-
-# Dropdown menu to filter tiers
-tiers = ['All Tiers', 'Easy', 'Moderate', 'Hard']
-selected_tier = st.selectbox('Select Tier:', tiers)
-
-# Filter the data based on the selected tier
-if selected_tier != 'All Tiers':
-    filtered_df = df[df['Tier'] == selected_tier]
-else:
-    filtered_df = df
-
-# Create HTML for the table
-html = '''
-<table>
-    <thead>
-        <tr>
-            <th>Tier</th>
-            <th>Model</th>
-            <th>FactScore</th>
-            <th>SAFE</th>
-            <th>Factcheck-GPT</th>
-            <th>VERIFY</th>
-        </tr>
-    </thead>
-    <tbody>
-'''
-
-# Generate the rows of the table
-current_tier = None
-for i, row in filtered_df.iterrows():
-    if row['Tier'] != current_tier:
-        if current_tier is not None:
-            # Close the previous tier row
-            html += '        </tr>'
-        current_tier = row['Tier']
-        html += f'        <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
+# Load the data
+data_path = "factbench_data.csv"
+df = pd.read_csv(data_path)
+
+# Create tabs
+tab1, tab2, tab3 = st.tabs(
+    ["Leaderboard", "Benchmark Details", "Submit your models"])
+
+# Tab 1: Leaderboard
+with tab1:
+    st.markdown('<div class="title">Leaderboard</div>',
+                unsafe_allow_html=True)
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    # Dropdown menu to filter tiers
+    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
+    selected_tier = st.selectbox('Select Tier:', tiers)
+
+    # Filter the data based on the selected tier
+    if selected_tier != 'All Tiers':
+        filtered_df = df[df['Tier'] == selected_tier]
     else:
-        html += '        <tr>'
-
-    # Fill in model and scores
-    html += f'''
-        <td>{row['Model']}</td>
-        <td>{row['FactScore']:.2f}</td>
-        <td>{row['SAFE']:.2f}</td>
-        <td>{row['Factcheck-GPT']:.2f}</td>
-        <td>{row['VERIFY']:.2f}</td>
-    </tr>
-    '''
-
-# Close the last row and table tags
-html += '''
-
-</table>
-'''
-
-# Display
-st.markdown(html, unsafe_allow_html=True)
+        filtered_df = df
+
+    # Create HTML for the table
+    html = '''
+    <table>
+        <thead>
+            <tr>
+                <th>Tier</th>
+                <th>Model</th>
+                <th>FactScore</th>
+                <th>SAFE</th>
+                <th>Factcheck-GPT</th>
+                <th>VERIFY</th>
+            </tr>
+        </thead>
+        <tbody>
+    '''
+
+    # Generate the rows of the table
+    current_tier = None
+    for i, row in filtered_df.iterrows():
+        if row['Tier'] != current_tier:
+            if current_tier is not None:
+                # Close the previous tier row
+                html += '            </tr>'
+            current_tier = row['Tier']
+            html += f'            <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
+        else:
+            html += '            <tr>'
+
+        # Fill in model and scores
+        html += f'''
+            <td>{row['Model']}</td>
+            <td>{row['FactScore']:.2f}</td>
+            <td>{row['SAFE']:.2f}</td>
+            <td>{row['Factcheck-GPT']:.2f}</td>
+            <td>{row['VERIFY']:.2f}</td>
+        </tr>
+        '''
+
+    # Close the table tags
+    html += '''
+    </tbody></table>
+    '''
+
+    # Display the table
+    st.markdown(html, unsafe_allow_html=True)
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Tab 2: Details
+with tab2:
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    st.markdown('<div class="title">Benchmark Details</div>',
+                unsafe_allow_html=True)
+    st.image(image, use_column_width=True)
+
+    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
+    st.write(
+        "Language models (LMs) are widely used by an increasing number of users, "
+        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
+        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
+        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
+    )
+
+    st.markdown('### Content Categorization')
+    st.write(
+        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
+        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
+        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
+    )
+
+    st.markdown('### Hallucination Prompts & FactBench Dataset')
+    st.write(
+        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
+        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
+        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
+        "regularly updated with new prompts."
+    )
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Tab 3: Links
+with tab3:
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    st.markdown('<div class="title">Submit your model information on our Github</div>',
+                unsafe_allow_html=True)
+
+    st.markdown(
+        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
+    st.markdown(
+        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
+
+    st.markdown('</div>', unsafe_allow_html=True)
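Review note: the new table builder hardcodes rowspan="4", which is only correct while every tier in factbench_data.csv lists exactly four models. A minimal sketch of a data-driven variant, not part of this commit (the build_table_html helper and its use of pandas groupby are illustrative):

import pandas as pd

def build_table_html(df: pd.DataFrame) -> str:
    """Render the leaderboard table, deriving each tier's rowspan from the data."""
    metrics = ['FactScore', 'SAFE', 'Factcheck-GPT', 'VERIFY']
    html = '<table><thead><tr><th>Tier</th><th>Model</th>'
    html += ''.join(f'<th>{m}</th>' for m in metrics)
    html += '</tr></thead><tbody>'
    # sort=False keeps the CSV's tier order (Easy, Moderate, Hard)
    for tier, group in df.groupby('Tier', sort=False):
        for i, (_, row) in enumerate(group.iterrows()):
            html += '<tr>'
            if i == 0:
                # rowspan matches the actual number of models in this tier
                html += (f'<td rowspan="{len(group)}" '
                         f'style="vertical-align: middle;">{tier}</td>')
            html += f"<td>{row['Model']}</td>"
            html += ''.join(f'<td>{row[m]:.2f}</td>' for m in metrics)
            html += '</tr>'
    html += '</tbody></table>'
    return html

Besides surviving changes to the number of models per tier, this variant avoids the stray extra </tr> that the hand-rolled loop emits at each tier boundary.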
factbench_data.csv ADDED
@@ -0,0 +1,13 @@
+Tier,Model,FactScore,SAFE,Factcheck-GPT,VERIFY
+Tier 1: Easy,GPT4-o,53.19,63.31,86.4,71.58
+Tier 1: Easy,Gemini1.5-Pro,51.79,61.24,83.45,69.38
+Tier 1: Easy,Llama3.1-70B-Instruct,52.49,61.29,83.48,67.27
+Tier 1: Easy,Llama3.1-405B-Instruct,53.22,61.63,83.57,64.94
+Tier 2: Moderate,GPT4-o,54.76,65.01,89.39,76.02
+Tier 2: Moderate,Gemini1.5-Pro,52.62,62.68,87.44,74.24
+Tier 2: Moderate,Llama3.1-70B-Instruct,52.53,62.64,85.16,72.01
+Tier 2: Moderate,Llama3.1-405B-Instruct,53.48,63.29,86.37,70.25
+Tier 3: Hard,GPT4-o,69.44,76.17,94.25,90.58
+Tier 3: Hard,Gemini1.5-Pro,66.05,75.69,91.09,87.82
+Tier 3: Hard,Llama3.1-70B-Instruct,69.85,77.55,92.89,86.63
+Tier 3: Hard,Llama3.1-405B-Instruct,70.04,77.01,93.64,85.79
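Both the tier filter and the rowspan logic in app.py depend on this file's exact Tier labels and its four-models-per-tier shape. A minimal sanity check, not part of this commit, that would catch future edits breaking either assumption:

import pandas as pd

df = pd.read_csv("factbench_data.csv")

# The selectbox options in app.py must match the CSV's Tier labels exactly.
expected_tiers = ['Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
assert list(df['Tier'].unique()) == expected_tiers

# The hardcoded rowspan="4" in app.py assumes exactly four models per tier.
assert (df.groupby('Tier').size() == 4).all()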