import streamlit as st
import pandas as pd
from PIL import Image

# Set up page config
st.set_page_config(
    page_title="FactBench Leaderboard",
    layout="wide",  # Layout remains wide, but content will be centered
)

# Load the pipeline overview image
image = Image.open("factEvalSteps.png")

# Custom CSS for the page
st.markdown(
    """
    """,
    unsafe_allow_html=True
)

# Display title and description (centered, per the layout note above)
st.markdown('<div style="text-align: center;">', unsafe_allow_html=True)
st.markdown('<h1>FactBench</h1>', unsafe_allow_html=True)
st.markdown('<h3>Benchmark for LM Factuality Evaluation</h3>', unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)

# Load the data
# data_path = "factbench_data.csv"
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

# Create tabs
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])
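# Optional schema check (illustrative): the leaderboard table below reads the
# columns listed here from tiered_models_data.csv. The list is inferred from
# the column accesses further down, so adjust it if the CSV schema differs.
expected_columns = [
    "tier", "model", "factuality_score", "hallucination_score",
    "avg_tokens", "avg_factual_units", "avg_undecidable_units",
    "avg_unsupported_units",
    "prompt_categories.Factual Recall",
    "prompt_categories.Conceptual Understanding",
    "prompt_categories.Procedural Execution",
    "prompt_categories.Comparative Analysis",
    "prompt_categories.Recommendations and Insights",
    "prompt_categories.Domain-Specific Knowledge",
    "prompt_categories.Temporal Context",
]
missing_columns = [col for col in expected_columns if col not in df.columns]
if missing_columns:
    st.warning(f"tiered_models_data.csv is missing expected columns: {missing_columns}")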
# Tab 1: Leaderboard
# with tab1:
#     st.markdown('<h2 style="text-align: center;">Leaderboard</h2>',
#                 unsafe_allow_html=True)
#     st.markdown('<div>', unsafe_allow_html=True)
#     # Dropdown menu to filter tiers
#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
#     selected_tier = st.selectbox('Select Tier:', tiers)
#     # Filter the data based on the selected tier
#     if selected_tier != 'All Tiers':
#         filtered_df = df[df['Tier'] == selected_tier]
#     else:
#         filtered_df = df
#     # Create HTML for the table
#     html = '''
#     <table>
#         <thead>
#             <tr>
#                 <th>Tier</th>
#                 <th>Model</th>
#                 <th>FactScore</th>
#                 <th>SAFE</th>
#                 <th>Factcheck-GPT</th>
#                 <th>VERIFY</th>
#             </tr>
#         </thead>
#         <tbody>
#     '''
#     # Generate the rows of the table
#     current_tier = None
#     for i, row in filtered_df.iterrows():
#         if row['Tier'] != current_tier:
#             if current_tier is not None:
#                 # Close the previous tier row
#                 html += '</tr>'
#             current_tier = row['Tier']
#             html += f'<tr><td>{current_tier}</td>'
#         else:
#             html += '<tr>'
#         # Fill in model and scores
#         html += f'''
#             <td>{row['Model']}</td>
#             <td>{row['FactScore']:.2f}</td>
#             <td>{row['SAFE']:.2f}</td>
#             <td>{row['Factcheck-GPT']:.2f}</td>
#             <td>{row['VERIFY']:.2f}</td>
#         </tr>
#         '''
#     # Close the last row and table tags
#     html += '''
#         </tbody>
#     </table>
#     '''
#     # Display the table
#     st.markdown(html, unsafe_allow_html=True)
#     st.markdown('</div>', unsafe_allow_html=True)
# Rank models by factuality score (1 = best)
df['rank'] = df['factuality_score'].rank(
    ascending=False, method='min').astype(int)

with tab1:
    st.markdown('<h2 style="text-align: center;">Leaderboard</h2>',
                unsafe_allow_html=True)
    st.markdown('<div>', unsafe_allow_html=True)

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
    selected_tier = st.selectbox('Select Tier:', tiers)

    # Filter the data based on the selected tier
    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    # Add sorting functionality for Factuality Score
    # sort_order = st.radio('Sort by Factuality Score:',
    #                       ('Ascending', 'Descending'))
    # # Sort the dataframe based on Factuality Score
    # if sort_order == 'Ascending':
    #     filtered_df = filtered_df.sort_values(
    #         by='factuality_score', ascending=True)
    # else:
    #     filtered_df = filtered_df.sort_values(
    #         by='factuality_score', ascending=False)

    # Option to sort by Factuality Score (highest first)
    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    # Sort the dataframe based on Factuality Score if the checkbox is selected
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by='factuality_score', ascending=False)
    else:
        updated_filtered_df = filtered_df

    # Create HTML for the table
    html = '''
    <table>
        <thead>
            <tr>
                <th>Rank</th>
                <th>Tier</th>
                <th>Model</th>
                <th>Factuality Score</th>
                <th>Hallucination Score</th>
                <th>Avg Tokens</th>
                <th>Avg Factual Units</th>
                <th>Avg Undecidable Units</th>
                <th>Avg Unsupported Units</th>
                <th>Factual Recall</th>
                <th>Conceptual Understanding</th>
                <th>Procedural Execution</th>
                <th>Comparative Analysis</th>
                <th>Recommendations and Insights</th>
                <th>Domain-Specific Knowledge</th>
                <th>Temporal Context</th>
            </tr>
        </thead>
        <tbody>
    '''

    # Generate the rows of the table
    current_tier = None
    for i, row in updated_filtered_df.iterrows():
        # if row['tier'] != current_tier:
        #     if current_tier is not None:
        #         html += '</tr>'
        #     current_tier = row['tier']
        #     # 7 models, change this number when more models
        #     html += f'<tr><td>{current_tier}</td>'
        # else:
        #     html += '<tr>'
        html += '<tr>'

        # Fill in model and scores
        html += f'''
            <td>{row['rank']}</td>
            <td>{row['tier']}</td>
            <td>{row['model']}</td>
            <td>{row['factuality_score']:.2f}</td>
            <td>{row['hallucination_score']:.2f}</td>
            <td>{row['avg_tokens']:.2f}</td>
            <td>{row['avg_factual_units']:.2f}</td>
            <td>{row['avg_undecidable_units']:.2f}</td>
            <td>{row['avg_unsupported_units']:.2f}</td>
            <td>{row['prompt_categories.Factual Recall']:.2f}</td>
            <td>{row['prompt_categories.Conceptual Understanding']:.2f}</td>
            <td>{row['prompt_categories.Procedural Execution']:.2f}</td>
            <td>{row['prompt_categories.Comparative Analysis']:.2f}</td>
            <td>{row['prompt_categories.Recommendations and Insights']:.2f}</td>
            <td>{row['prompt_categories.Domain-Specific Knowledge']:.2f}</td>
            <td>{row['prompt_categories.Temporal Context']:.2f}</td>
        </tr>
        '''

    # Close the table
    html += '''
        </tbody>
    </table>
    '''

    # Display the table
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)
# Tab 2: Details
with tab2:
    st.markdown('<div>', unsafe_allow_html=True)
    st.markdown('<h2 style="text-align: center;">Benchmark Details</h2>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics: those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )

    st.markdown('</div>', unsafe_allow_html=True)
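# The "Content Categorization" text above describes VERIFY labelling each
# content unit as supported, unsupported, or undecidable. As a rough sketch of
# how such labels could roll up into the leaderboard's factuality score, the
# helper below treats the score as the supported fraction of all labelled
# units. This is an illustrative assumption, not necessarily the exact
# aggregation used to produce tiered_models_data.csv.
def example_factuality_score(supported: int, unsupported: int, undecidable: int) -> float:
    """Hypothetical helper: fraction of labelled units judged supported."""
    total = supported + unsupported + undecidable
    return supported / total if total else 0.0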
# Tab 3: Links
with tab3:
    st.markdown('<div>', unsafe_allow_html=True)
    st.markdown('<h3 style="text-align: center;">Submit your model information on our GitHub</h3>',
                unsafe_allow_html=True)
    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
    st.markdown('</div>', unsafe_allow_html=True)