# Spaces: Running
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from PIL import Image | |
import base64 | |
from io import BytesIO | |
# --- Page config ---
# Browser-tab title and full-width ("wide") page layout.
# NOTE(review): Streamlit has historically required this to be the first
# st.* call of the script -- keep it ahead of the other Streamlit commands.
st.set_page_config(
    layout="wide",
    page_title="VeriFact Leaderboard",
)
# --- Load images --- | |
def load_image(path):
    """Open the image file at ``path`` with Pillow and return the result.

    Args:
        path: Location of the image file; handed unchanged to ``Image.open``.

    Returns:
        Whatever ``Image.open(path)`` returns.
    """
    image = Image.open(path)
    return image
logo = load_image("factrbench.png")
chart = load_image("test.png")

# Display logo: write it to an in-memory PNG, base64-encode the bytes and
# embed them as a data URI in a centred <img> tag.
buf = BytesIO()
logo.save(buf, format="PNG")
logo_png_bytes = buf.getvalue()
logo_b64 = base64.b64encode(logo_png_bytes).decode("utf-8")

logo_html = f"""
<div style="text-align:center; margin-bottom:20px;">
<img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
</div>
"""
st.markdown(logo_html, unsafe_allow_html=True)
# Header: a centred title line followed by a links / metadata line, both
# emitted as raw HTML.
# NOTE(review): the characters in front of the links and of "Version" look
# like mis-encoded emoji (mojibake), and every href is empty -- confirm the
# intended icons and link targets.
header_html = """
<div style="text-align:center;">
<p style="font-size:22px;">
VERIFACT: Enhancing Long-Form Factuality Evaluation...
</p>
<p style="font-size:20px;">
# π <a href="">Paper</a> | π» <a href="">GitHub</a> | π€ <a href="">HuggingFace</a>
βοΈ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
</p>
</div>
"""
st.markdown(header_html, unsafe_allow_html=True)
# --- Load data --- | |
def load_data(path="models.json"):
    """Load per-model scores from a JSON Lines file and add averages and ranks.

    The file is read with ``pd.read_json(path, lines=True)`` and must provide
    the score columns ``T1`` .. ``T11``.  The following columns are added:

    * ``Avg`` -- row-wise mean of ``T1`` .. ``T11``, rounded to one decimal.
      Missing scores are skipped by the mean.
    * ``<col>_rank`` for every score column and for ``Avg`` -- integer rank
      in which 1 is the highest value and tied values share the smallest
      rank (``method="min"``).

    Args:
        path: Path to the JSON Lines file.

    Returns:
        The loaded ``DataFrame`` including the added columns.

    Raises:
        KeyError: If any of the ``T1`` .. ``T11`` columns is absent.
    """
    df = pd.read_json(path, lines=True)
    task_cols = [f"T{i}" for i in range(1, 12)]
    df["Avg"] = df[task_cols].mean(axis=1).round(1)
    # Compute rank per column
    for col in [*task_cols, "Avg"]:
        # na_option="bottom" ranks a missing score last; the default leaves
        # its rank as NaN, which makes the integer cast below raise.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
df = load_data()

# --- Tabs ---
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")

    # Build HTML table: one "Model" column followed by the score columns.
    score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
    cols = ["Model"] + score_cols
    # Largest (worst) rank per score column, used to scale ranks to [0, 1].
    max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

    # Collect fragments and join once instead of growing a string with +=.
    parts = ["<table style='border-collapse:collapse; width:100%;'>"]
    # header
    header_cells = "".join(f"<th style='padding:4px;'>{c}</th>" for c in cols)
    parts.append("<tr>" + header_cells + "</tr>")
    # rows
    for _, row in df.iterrows():
        model = row["Model"]
        parts.append("<tr>")
        parts.append(f"<td style='padding:4px;text-align:left;'>{model}</td>")
        for c in score_cols:
            val = row[c]
            rank = row[f"{c}_rank"]
            # 1.0 for rank 1 down to 0.0 for the worst rank; `or 1` keeps the
            # divisor non-zero when every model shares rank 1.
            norm = 1 - (rank - 1) / (max_ranks[c] - 1 or 1)
            # Interpolate from white (norm 0) to rgb(182,243,255) (norm 1).
            # NOTE(review): blue stays at 255, so the strongest shade is a
            # light blue although the caption says "green" -- confirm palette.
            r = int(255 - norm * (255 - 182))
            g = int(255 - norm * (255 - 243))
            b = 255
            style = f"background-color:rgb({r},{g},{b}); padding:4px;"
            bold = "font-weight:bold;" if rank == 1 else ""
            parts.append(f"<td style='{style}{bold}'>{val}</td>")
        parts.append("</tr>")
    parts.append("</table>")
    html = "".join(parts)
    st.markdown(html, unsafe_allow_html=True)
with tab2:
    # Embed the chart as a base64 PNG data URI inside a centred <img> tag.
    buf2 = BytesIO()
    chart.save(buf2, format="PNG")
    chart_b64 = base64.b64encode(buf2.getvalue()).decode("utf-8")
    chart_html = (
        "\n"
        '<div style="text-align:center;">\n'
        f'<img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>\n'
        "</div>\n"
    )
    st.markdown(chart_html, unsafe_allow_html=True)

    # Explanatory sections, each a markdown heading followed by its text.
    sections = (
        ("### What is VERIFACT?",
         "VERIFACT is a factuality evaluation framework..."),
        ("### What is FACTRBENCH?",
         "FACTRBENCH is the first benchmark for long-form factuality evaluation..."),
        ("### Key Findings",
         "VERIFACT outperforms prior methods [...]"),
    )
    for heading, body in sections:
        st.markdown(heading)
        st.write(body)