Spaces:

shezamunir
/

test-leaderboard

Sleeping

App Files Files Community

sheza munir committed on Oct 10, 2024

Commit

aecfcd3

verified ·

1 Parent(s): 0fc8763

Upload 2 files

Browse files

Files changed (2) hide show

app.py +157 -0
requirements.py +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+import streamlit as st
+import pandas as pd
+# Page-level settings: browser-tab title and the narrow, centered layout.
+st.set_page_config(page_title="FactBench Leaderboard", layout="centered")
+# Page-wide stylesheet: loads the Courier Prime web font, styles the title and
+# description blocks, and formats the leaderboard table (borders, column
+# widths, row hover). Injected as raw HTML, hence unsafe_allow_html=True.
+custom_css = """
+    <style>
+    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');
+    html, body, [class*="css"] {
+        font-family: 'Courier Prime', monospace;  /* Command-line font */
+    }
+    .title {
+        font-size: 42px;
+        font-weight: bold;
+        text-align: center;
+        color: #333;
+        margin-bottom: 5px;
+    }
+    .description {
+        font-size: 22px;
+        text-align: center;
+        margin-bottom: 30px;
+        color: #555;
+    }
+    .table-container {
+        margin-top: 20px;
+    }
+    table {
+        width: 100%;  /* Set table to fill width */
+        border-collapse: collapse;  /* Merge cells neatly */
+        border-radius: 10px;  /* Rounded edges */
+        overflow: hidden;  /* Ensure rounded edges are visible */
+    }
+    th, td {
+        padding: 8px;  /* Reduced padding for smaller font */
+        text-align: center;  /* Center-align text */
+        border: 1px solid #ddd;  /* Add borders */
+        font-size: 14px;  /* Smaller font size */
+    }
+    th {
+        background-color: #f2f2f2;  /* Light gray background for header */
+        font-weight: bold;  /* Bold font for headers */
+    }
+    /* Specific column widths */
+    td:nth-child(2), th:nth-child(2) {  /* Wider Model column */
+        width: 30%;  /* Increased width for model column */
+    }
+    td:nth-child(3), th:nth-child(3),
+    td:nth-child(4), th:nth-child(4),
+    td:nth-child(5), th:nth-child(5),
+    td:nth-child(6), th:nth-child(6) {
+        width: 17.5%;  /* Equal width for the rest */
+    }
+    /* Hover effect for table rows */
+    tr:hover {
+        background-color: #eaeaea;  /* Light grey on hover */
+    }
+    </style>
+    """
+st.markdown(custom_css, unsafe_allow_html=True)
+# Page heading and tagline, each wrapped in a <div> whose class selects the
+# matching rule in the injected stylesheet.
+for css_class, text in (
+    ('title', 'FactBench Leaderboard'),
+    ('description', 'Benchmark for LM Factuality Evaluation'),
+):
+    st.markdown(f'<div class="{css_class}">{text}</div>', unsafe_allow_html=True)
+# Leaderboard scores for all tiers combined. Each column holds twelve values:
+# the four models for Easy, then Moderate, then Hard (one source line per tier).
+tier_names = ['Easy', 'Moderate', 'Hard']
+model_names = ['GPT4-o', 'Gemini1.5-Pro', 'Llama3.1-70B-Instruct', 'Llama3.1-405B-Instruct']
+data = {
+    # Each tier label is repeated once per model; the model list repeats per tier.
+    'Tier': [tier for tier in tier_names for _ in model_names],
+    'Model': model_names * len(tier_names),
+    'FactScore': [
+        53.19, 51.79, 52.49, 53.22,
+        54.76, 52.62, 52.53, 53.48,
+        69.44, 66.05, 69.85, 70.04,
+    ],
+    'SAFE': [
+        63.31, 61.24, 61.29, 61.63,
+        65.01, 62.68, 62.64, 63.29,
+        76.17, 75.69, 77.55, 77.01,
+    ],
+    'Factcheck-GPT': [
+        86.4, 83.45, 83.48, 83.57,
+        89.39, 87.44, 85.16, 86.37,
+        94.25, 91.09, 92.89, 93.64,
+    ],
+    'VERIFY': [
+        71.58, 69.38, 67.27, 64.94,
+        76.02, 74.24, 72.01, 70.25,
+        90.58, 87.82, 86.63, 85.79,
+    ],
+}
+# One DataFrame row per (tier, model) pair.
+df = pd.DataFrame(data)
+# Tier selector; the first option is a sentinel meaning "do not filter".
+tiers = ['All Tiers', 'Easy', 'Moderate', 'Hard']
+selected_tier = st.selectbox('Select Tier:', tiers)
+# Keep the whole frame for the sentinel, otherwise only the chosen tier's rows.
+filtered_df = df if selected_tier == 'All Tiers' else df[df['Tier'] == selected_tier]
+# Build the leaderboard as raw HTML: a hand-written table lets the Tier cell
+# span all of that tier's model rows.
+table_head = '''
+<table>
+    <thead>
+        <tr>
+            <th>Tier</th>
+            <th>Model</th>
+            <th>FactScore</th>
+            <th>SAFE</th>
+            <th>Factcheck-GPT</th>
+            <th>VERIFY</th>
+        </tr>
+    </thead>
+    <tbody>
+'''
+# Collect one fragment per row and join once instead of repeated `+=`.
+table_rows = []
+# sort=False keeps the tiers in their order of first appearance in filtered_df.
+for tier, tier_rows in filtered_df.groupby('Tier', sort=False):
+    for position, (_, row) in enumerate(tier_rows.iterrows()):
+        # Only the first row of a tier carries the Tier cell; its rowspan is
+        # the tier's actual row count rather than a hard-coded 4.
+        tier_cell = (
+            f'<td rowspan="{len(tier_rows)}" style="vertical-align: middle;">{tier}</td>'
+            if position == 0 else ''
+        )
+        # Each fragment opens and closes its own <tr>, so no extra </tr> is
+        # needed when the tier changes.
+        table_rows.append(f'''    <tr>{tier_cell}
+        <td>{row['Model']}</td>
+        <td>{row['FactScore']:.2f}</td>
+        <td>{row['SAFE']:.2f}</td>
+        <td>{row['Factcheck-GPT']:.2f}</td>
+        <td>{row['VERIFY']:.2f}</td>
+    </tr>
+''')
+# Close <tbody> (opened in table_head) as well as <table>.
+html = table_head + ''.join(table_rows) + '''    </tbody>
+</table>
+'''
+# Display
+st.markdown(html, unsafe_allow_html=True)

requirements.py ADDED Viewed

	@@ -0,0 +1,3 @@

+pandas
+streamlit
+scikit-learn == 1.0.2