from html import escape

import pandas as pd
import streamlit as st
from PIL import Image
# Configure the page: browser-tab title plus the wide layout.
# (Original note: the layout remains wide, but the content will be centered.)
st.set_page_config(page_title="FactBench Leaderboard", layout="wide")

# Open the figure from disk; it is shown with st.image in the
# "Benchmark Details" tab further down.
image = Image.open("factEvalSteps.png")
# Custom CSS for the page
# NOTE(review): the injected string holds nothing but a newline, so no style
# rules are actually applied here -- confirm whether a stylesheet was meant
# to be included in this call.
st.markdown("\n", unsafe_allow_html=True)
# Display title and description
# Each literal below used to contain a raw line break inside single quotes,
# which Python rejects as an unterminated string. The line breaks are now
# written as "\n" escapes so the emitted text is unchanged and the file parses.
# NOTE(review): the strings carry no HTML tags although unsafe_allow_html=True
# is passed -- the wrapper markup looks like it is missing; confirm and restore.
st.markdown('\n', unsafe_allow_html=True)
st.markdown('\nFactBench\n', unsafe_allow_html=True)
st.markdown(
    '\nBenchmark for LM Factuality Evaluation\n',
    unsafe_allow_html=True,
)
st.markdown('\n', unsafe_allow_html=True)
# Read the leaderboard table from CSV.
# (An earlier revision pointed at "factbench_data.csv" instead.)
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

# Build the three top-level tabs, in display order.
tab_labels = ["Leaderboard", "Benchmark Details", "Submit your models"]
tab1, tab2, tab3 = st.tabs(tab_labels)
# Tab 1: Leaderboard
# NOTE: the commented-out block below is an earlier, tier-grouped version of
# the leaderboard (FactScore / SAFE / Factcheck-GPT / VERIFY columns). It is
# kept for reference only; the live implementation follows it. Three lines of
# this block had fallen outside the comment and were live, unterminated string
# fragments -- they are folded back in here (line breaks written as "\n").
# NOTE(review): the HTML tags inside these strings appear to be missing.
# with tab1:
#     st.markdown('Leaderboard\n',
#                 unsafe_allow_html=True)
#     st.markdown('', unsafe_allow_html=True)
#     # Dropdown menu to filter tiers
#     tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
#     selected_tier = st.selectbox('Select Tier:', tiers)
#     # Filter the data based on the selected tier
#     if selected_tier != 'All Tiers':
#         filtered_df = df[df['Tier'] == selected_tier]
#     else:
#         filtered_df = df
#     # Create HTML for the table
#     html = '''
#
#
#
#     Tier |
#     Model |
#     FactScore |
#     SAFE |
#     Factcheck-GPT |
#     VERIFY |
#
#
#
#     '''
#     # Generate the rows of the table
#     current_tier = None
#     for i, row in filtered_df.iterrows():
#         if row['Tier'] != current_tier:
#             if current_tier is not None:
#                 # Close the previous tier row
#                 html += ' '
#             current_tier = row['Tier']
#             html += f' {current_tier} | '
#         else:
#             html += '\n'
#         # Fill in model and scores
#         html += f'''
#         {row['Model']} |
#         {row['FactScore']:.2f} |
#         {row['SAFE']:.2f} |
#         {row['Factcheck-GPT']:.2f} |
#         {row['VERIFY']:.2f} |
#
#         '''
#     # Close the last row and table tags
#     html += '''
#
#     '''
#     # Display the table
#     st.markdown(html, unsafe_allow_html=True)
#     st.markdown('\n', unsafe_allow_html=True)
# Rank rows by factuality score, highest score first. With method='min',
# tied scores share the lowest rank of their group. The rank is computed on
# the full frame, i.e. before any tier filtering is applied further down.
score_rank = df['factuality_score'].rank(method='min', ascending=False)
df['rank'] = score_rank.astype(int)
# Ordered layout of the leaderboard table. Each entry is
# (header label, column name in the data frame, format spec); a spec of None
# means the value is rendered with str() instead of numeric formatting.
_LEADERBOARD_COLUMNS = (
    ('Rank', 'rank', None),
    ('Tier', 'tier', None),
    ('Model', 'model', None),
    ('Factuality Score', 'factuality_score', '.2f'),
    ('Hallucination Score', 'hallucination_score', '.2f'),
    ('Avg Tokens', 'avg_tokens', '.2f'),
    ('Avg Factual Units', 'avg_factual_units', '.2f'),
    ('Avg Undecidable Units', 'avg_undecidable_units', '.2f'),
    ('Avg Unsupported Units', 'avg_unsupported_units', '.2f'),
    ('Factual Recall', 'prompt_categories.Factual Recall', '.2f'),
    ('Conceptual Understanding',
     'prompt_categories.Conceptual Understanding', '.2f'),
    ('Procedural Execution', 'prompt_categories.Procedural Execution', '.2f'),
    ('Comparative Analysis', 'prompt_categories.Comparative Analysis', '.2f'),
    ('Recommendations and Insights',
     'prompt_categories.Recommendations and Insights', '.2f'),
    ('Domain-Specific Knowledge',
     'prompt_categories.Domain-Specific Knowledge', '.2f'),
    ('Temporal Context', 'prompt_categories.Temporal Context', '.2f'),
)


def _leaderboard_table_html(frame):
    """Return an HTML ``<table>`` string for the rows of *frame*.

    One header cell and one body cell is produced per entry of
    ``_LEADERBOARD_COLUMNS``, in that order. Every cell's text is passed
    through ``html.escape`` because the result is rendered with
    ``unsafe_allow_html=True`` and values such as model names come straight
    from the CSV. The markup is emitted on a single line and assembled with
    ``str.join`` rather than repeated ``+=``.

    Args:
        frame: pandas DataFrame containing every column named in
            ``_LEADERBOARD_COLUMNS``.

    Returns:
        The table markup; an empty frame yields a header-only table.

    Raises:
        KeyError: if *frame* lacks one of the expected columns.
    """
    head = ''.join(
        f'<th>{escape(label)}</th>' for label, _, _ in _LEADERBOARD_COLUMNS
    )
    rows = []
    for record in frame.to_dict(orient='records'):
        cells = []
        for _, column, spec in _LEADERBOARD_COLUMNS:
            value = record[column]
            text = format(value, spec) if spec else str(value)
            cells.append(f'<td>{escape(text)}</td>')
        rows.append(f"<tr>{''.join(cells)}</tr>")
    return (
        f"<table><thead><tr>{head}</tr></thead>"
        f"<tbody>{''.join(rows)}</tbody></table>"
    )


with tab1:
    # The literals here used to contain raw line breaks inside single quotes
    # (a SyntaxError); they are now written with "\n" escapes, text unchanged.
    # NOTE(review): these wrapper strings carry no HTML tags -- the markup
    # looks like it is missing; confirm and restore.
    st.markdown('Leaderboard\n', unsafe_allow_html=True)
    st.markdown('', unsafe_allow_html=True)

    # Dropdown menu to filter tiers
    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
    selected_tier = st.selectbox('Select Tier:', tiers)

    # Filter the data based on the selected tier
    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    # Optional sort: when ticked, rows are ordered by factuality score with
    # the highest score first (descending); otherwise the CSV order is kept.
    # An Ascending/Descending radio was sketched here earlier but is not used.
    sort_by_factuality = st.checkbox('Sort by Factuality Score')
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by='factuality_score', ascending=False)
    else:
        updated_filtered_df = filtered_df

    # Build and display the table. Every row repeats its tier; an earlier
    # sketch grouped consecutive rows of the same tier under one tier cell.
    html = _leaderboard_table_html(updated_filtered_df)
    st.markdown(html, unsafe_allow_html=True)
    st.markdown('\n', unsafe_allow_html=True)
# Tab 2: Details
with tab2:
    # The heading and closing literals used to contain raw line breaks inside
    # single quotes (a SyntaxError); they are now written with "\n" escapes,
    # text unchanged, and the body is indented under the `with`.
    # NOTE(review): these wrapper strings carry no HTML tags -- the markup
    # looks like it is missing; confirm and restore.
    st.markdown('', unsafe_allow_html=True)
    st.markdown('\nBenchmark Details\n', unsafe_allow_html=True)

    # NOTE(review): newer Streamlit releases deprecate `use_column_width` in
    # favour of `use_container_width`; switch once the minimum supported
    # Streamlit version is confirmed.
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) are widely used by an increasing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )

    st.markdown('\n', unsafe_allow_html=True)
# Tab 3: Links
with tab3:
    # The heading and closing literals used to contain raw line breaks inside
    # single quotes (a SyntaxError); they are now written with "\n" escapes,
    # text unchanged, and the body is indented under the `with`.
    # NOTE(review): these wrapper strings carry no HTML tags -- the markup
    # looks like it is missing; confirm and restore.
    st.markdown('', unsafe_allow_html=True)
    st.markdown(
        '\nSubmit your model information on our Github\n',
        unsafe_allow_html=True,
    )

    # Plain Markdown links to the evaluation repository and its issue form.
    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')

    st.markdown('\n', unsafe_allow_html=True)