sheza munir committed
Commit 1b415b5 · verified · 1 Parent(s): aecfcd3

Updated app.py

Files changed (2):
  1. app.py +148 -105
  2. factbench_data.csv +13 -0
app.py CHANGED
@@ -1,19 +1,24 @@
 import streamlit as st
 import pandas as pd
+from PIL import Image
 
-# Set up page config for a better look
+# Set up page config
 st.set_page_config(
     page_title="FactBench Leaderboard",
-    layout="centered",
+    # layout="wide",  # commented out: content uses Streamlit's default centered layout
 )
 
+# Load the image
+image = Image.open("factEvalSteps.png")
+
+# Custom CSS for the page
 st.markdown(
     """
     <style>
     @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
 
     html, body, [class*="css"] {
-        font-family: 'Courier Prime', monospace;  /* Command-line font */
+        font-family: 'Courier Prime', monospace;
     }
 
     .title {
@@ -31,127 +36,165 @@ st.markdown(
         color: #555;
     }
 
-    .table-container {
-        margin-top: 20px;
+    .container {
+        max-width: 1000px; /* Set a max-width for the container */
+        margin: 0 auto; /* Center the container */
+        padding: 20px;
     }
 
+
+
     table {
-        width: 100%; /* Set table to fill width */
-        border-collapse: collapse; /* Merge cells neatly */
-        border-radius: 10px; /* Rounded edges */
-        overflow: hidden; /* Ensure rounded edges are visible */
+        width: 100%;
+        border-collapse: collapse;
+        border-radius: 10px;
+        overflow: hidden;
     }
 
     th, td {
-        padding: 8px; /* Reduced padding for smaller font */
-        text-align: center; /* Center-align text */
-        border: 1px solid #ddd; /* Add borders */
-        font-size: 14px; /* Smaller font size */
+        padding: 8px;
+        text-align: center;
+        border: 1px solid #ddd;
+        font-size: 14px;
+        transition: background-color 0.3s;
     }
 
     th {
-        background-color: #f2f2f2; /* Light gray background for header */
-        font-weight: bold; /* Bold font for headers */
-    }
-
-    /* Specific column widths */
-    td:nth-child(2), th:nth-child(2) { /* Wider Model column */
-        width: 30%; /* Increased width for model column */
-    }
-
-    td:nth-child(3), th:nth-child(3),
-    td:nth-child(4), th:nth-child(4),
-    td:nth-child(5), th:nth-child(5),
-    td:nth-child(6), th:nth-child(6) {
-        width: 17.5%; /* Equal width for the rest */
+        background-color: #f2f2f2;
+        font-weight: bold;
     }
 
-    /* Hover effect for table rows */
-    tr:hover {
-        background-color: #eaeaea; /* Light grey on hover */
+    td:hover {
+        background-color: #eaeaea;
     }
-
     </style>
     """,
     unsafe_allow_html=True
 )
 
-# Add title and description
-st.markdown('<div class="title">FactBench Leaderboard</div>',
+# Display title and description
+st.markdown('<div class="container">', unsafe_allow_html=True)
+st.markdown('<div class="title">FactBench</div>',
             unsafe_allow_html=True)
 st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
             unsafe_allow_html=True)
+st.markdown('</div>', unsafe_allow_html=True)
 
-# Data for all tiers combined
-data = {
-    'Tier': ['Easy', 'Easy', 'Easy', 'Easy',
-             'Moderate', 'Moderate', 'Moderate', 'Moderate',
-             'Hard', 'Hard', 'Hard', 'Hard'],
-    'Model': ['GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct',
-              'GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct',
-              'GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct'],
-    'FactScore': [53.19, 51.79, 52.49, 53.22, 54.76, 52.62, 52.53, 53.48, 69.44, 66.05, 69.85, 70.04],
-    'SAFE': [63.31, 61.24, 61.29, 61.63, 65.01, 62.68, 62.64, 63.29, 76.17, 75.69, 77.55, 77.01],
-    'Factcheck-GPT': [86.4, 83.45, 83.48, 83.57, 89.39, 87.44, 85.16, 86.37, 94.25, 91.09, 92.89, 93.64],
-    'VERIFY': [71.58, 69.38, 67.27, 64.94, 76.02, 74.24, 72.01, 70.25, 90.58, 87.82, 86.63, 85.79]
-}
-
-# Convert the data to a DataFrame
-df = pd.DataFrame(data)
-
-# Dropdown menu to filter tiers
-tiers = ['All Tiers', 'Easy', 'Moderate', 'Hard']
-selected_tier = st.selectbox('Select Tier:', tiers)
-
-# Filter the data based on the selected tier
-if selected_tier != 'All Tiers':
-    filtered_df = df[df['Tier'] == selected_tier]
-else:
-    filtered_df = df
-
-# Create HTML for the table
-html = '''
-<table>
-    <thead>
-        <tr>
-            <th>Tier</th>
-            <th>Model</th>
-            <th>FactScore</th>
-            <th>SAFE</th>
-            <th>Factcheck-GPT</th>
-            <th>VERIFY</th>
-        </tr>
-    </thead>
-    <tbody>
-'''
-
-# Generate the rows of the table
-current_tier = None
-for i, row in filtered_df.iterrows():
-    if row['Tier'] != current_tier:
-        if current_tier is not None:
-            # Close the previous tier row
-            html += '        </tr>'
-        current_tier = row['Tier']
-        html += f'        <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
+# Load the data
+data_path = "factbench_data.csv"
+df = pd.read_csv(data_path)
+
+# Create tabs
+tab1, tab2, tab3 = st.tabs(
+    ["Leaderboard", "Benchmark Details", "Submit your models"])
+
+# Tab 1: Leaderboard
+with tab1:
+    st.markdown('<div class="title">Leaderboard</div>',
+                unsafe_allow_html=True)
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    # Dropdown menu to filter tiers
+    tiers = ['All Tiers', 'Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
+    selected_tier = st.selectbox('Select Tier:', tiers)
+
+    # Filter the data based on the selected tier
+    if selected_tier != 'All Tiers':
+        filtered_df = df[df['Tier'] == selected_tier]
     else:
-        html += '        <tr>'
-
-    # Fill in model and scores
-    html += f'''
-        <td>{row['Model']}</td>
-        <td>{row['FactScore']:.2f}</td>
-        <td>{row['SAFE']:.2f}</td>
-        <td>{row['Factcheck-GPT']:.2f}</td>
-        <td>{row['VERIFY']:.2f}</td>
-    </tr>
-    '''
-
-# Close the last row and table tags
-html += '''
-
-</table>
-'''
-
-# Display
-st.markdown(html, unsafe_allow_html=True)
+        filtered_df = df
+
+    # Create HTML for the table
+    html = '''
+    <table>
+        <thead>
+            <tr>
+                <th>Tier</th>
+                <th>Model</th>
+                <th>FactScore</th>
+                <th>SAFE</th>
+                <th>Factcheck-GPT</th>
+                <th>VERIFY</th>
+            </tr>
+        </thead>
+        <tbody>
+    '''
+
+    # Generate the rows of the table
+    current_tier = None
+    for i, row in filtered_df.iterrows():
+        if row['Tier'] != current_tier:
+            if current_tier is not None:
+                # Close the previous tier row
+                html += '            </tr>'
+            current_tier = row['Tier']
+            html += f'            <tr><td rowspan="4" style="vertical-align: middle;">{current_tier}</td>'
+        else:
+            html += '            <tr>'
+
+        # Fill in model and scores
+        html += f'''
+            <td>{row['Model']}</td>
+            <td>{row['FactScore']:.2f}</td>
+            <td>{row['SAFE']:.2f}</td>
+            <td>{row['Factcheck-GPT']:.2f}</td>
+            <td>{row['VERIFY']:.2f}</td>
+        </tr>
+        '''
+
+    # Close the table tags
+    html += '''
+    </tbody></table>
+    '''
+
+    # Display the table
+    st.markdown(html, unsafe_allow_html=True)
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Tab 2: Details
+with tab2:
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    st.markdown('<div class="title">Benchmark Details</div>',
+                unsafe_allow_html=True)
+    st.image(image, use_column_width=True)
+
+    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
+    st.write(
+        "Language models (LMs) are widely used by an increasing number of users, "
+        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
+        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
+        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
+    )
+
+    st.markdown('### Content Categorization')
+    st.write(
+        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
+        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
+        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
+    )
+
+    st.markdown('### Hallucination Prompts & FactBench Dataset')
+    st.write(
+        "Using VERIFY, we identify 'hallucination prompts' across diverse topics—those eliciting the highest rates of "
+        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
+        "fine-grained topics. Our dataset captures emerging factuality challenges in real-world LM interactions and is "
+        "regularly updated with new prompts."
+    )
+
+    st.markdown('</div>', unsafe_allow_html=True)
+
+# Tab 3: Links
+with tab3:
+    st.markdown('<div class="tab-content">', unsafe_allow_html=True)
+
+    st.markdown('<div class="title">Submit your model information on our Github</div>',
+                unsafe_allow_html=True)
+
+    st.markdown(
+        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
+    st.markdown(
+        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')
+
+    st.markdown('</div>', unsafe_allow_html=True)
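Review note: the new table builder hardcodes rowspan="4", which is only correct while every tier in factbench_data.csv lists exactly four models. A minimal sketch of a data-driven variant, not part of this commit (the build_table_html helper and its use of pandas groupby are illustrative):

import pandas as pd

def build_table_html(df: pd.DataFrame) -> str:
    """Render the leaderboard table, deriving each tier's rowspan from the data."""
    metrics = ['FactScore', 'SAFE', 'Factcheck-GPT', 'VERIFY']
    html = '<table><thead><tr><th>Tier</th><th>Model</th>'
    html += ''.join(f'<th>{m}</th>' for m in metrics)
    html += '</tr></thead><tbody>'
    # sort=False keeps the CSV's tier order (Easy, Moderate, Hard)
    for tier, group in df.groupby('Tier', sort=False):
        for i, (_, row) in enumerate(group.iterrows()):
            html += '<tr>'
            if i == 0:
                # rowspan matches the actual number of models in this tier
                html += (f'<td rowspan="{len(group)}" '
                         f'style="vertical-align: middle;">{tier}</td>')
            html += f"<td>{row['Model']}</td>"
            html += ''.join(f'<td>{row[m]:.2f}</td>' for m in metrics)
            html += '</tr>'
    html += '</tbody></table>'
    return html

Besides surviving changes to the number of models per tier, this variant avoids the stray extra </tr> that the hand-rolled loop emits at each tier boundary.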
factbench_data.csv ADDED
@@ -0,0 +1,13 @@
+Tier,Model,FactScore,SAFE,Factcheck-GPT,VERIFY
+Tier 1: Easy,GPT4-o,53.19,63.31,86.4,71.58
+Tier 1: Easy,Gemini1.5-Pro,51.79,61.24,83.45,69.38
+Tier 1: Easy,Llama3.1-70B-Instruct,52.49,61.29,83.48,67.27
+Tier 1: Easy,Llama3.1-405B-Instruct,53.22,61.63,83.57,64.94
+Tier 2: Moderate,GPT4-o,54.76,65.01,89.39,76.02
+Tier 2: Moderate,Gemini1.5-Pro,52.62,62.68,87.44,74.24
+Tier 2: Moderate,Llama3.1-70B-Instruct,52.53,62.64,85.16,72.01
+Tier 2: Moderate,Llama3.1-405B-Instruct,53.48,63.29,86.37,70.25
+Tier 3: Hard,GPT4-o,69.44,76.17,94.25,90.58
+Tier 3: Hard,Gemini1.5-Pro,66.05,75.69,91.09,87.82
+Tier 3: Hard,Llama3.1-70B-Instruct,69.85,77.55,92.89,86.63
+Tier 3: Hard,Llama3.1-405B-Instruct,70.04,77.01,93.64,85.79
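Both the tier filter and the rowspan logic in app.py depend on this file's exact Tier labels and its four-models-per-tier shape. A minimal sanity check, not part of this commit, that would catch future edits breaking either assumption:

import pandas as pd

df = pd.read_csv("factbench_data.csv")

# The selectbox options in app.py must match the CSV's Tier labels exactly.
expected_tiers = ['Tier 1: Easy', 'Tier 2: Moderate', 'Tier 3: Hard']
assert list(df['Tier'].unique()) == expected_tiers

# The hardcoded rowspan="4" in app.py assumes exactly four models per tier.
assert (df.groupby('Tier').size() == 4).all()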