Spaces:

launch
/

ExpertLongBench

Running

File size: 3,548 Bytes

0075c7c

import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
import base64
from io import BytesIO

# --- Page config ---
# Browser-tab title and wide layout for the whole app.
st.set_page_config(
    page_title="VeriFact Leaderboard",
    layout="wide",
)

# --- Load images ---
@st.cache_data
def load_image(path):
    """Open the image file at ``path`` with Pillow and return the image.

    The function is wrapped in ``st.cache_data``, so the result is cached
    by Streamlit per ``path`` value.
    """
    image = Image.open(path)
    return image

# Both images are loaded up front; `chart` is rendered later in the second tab.
logo, chart = (load_image(name) for name in ("factrbench.png", "test.png"))

# Display logo: re-encode it as PNG in memory and embed it as a base64 data
# URI so it can be centered and sized with inline HTML.
buf = BytesIO()
logo.save(buf, format="PNG")
png_bytes = buf.getvalue()
logo_b64 = str(base64.b64encode(png_bytes), "utf-8")
logo_html = f"""
    <div style="text-align:center; margin-bottom:20px;">
        <img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
    </div>
"""
st.markdown(logo_html, unsafe_allow_html=True)

# Header: centered tagline followed by a row of resource links and
# leaderboard metadata (version, model count, last update).
# The link row must not start with "# ": this is raw HTML, so a Markdown
# heading marker there is rendered as a literal "#" character.
# NOTE(review): the three href="" attributes are empty placeholders — fill in
# the real Paper / GitHub / HuggingFace URLs.
st.markdown("""
<div style="text-align:center;">
  <p style="font-size:22px;">
    VERIFACT: Enhancing Long-Form Factuality Evaluation...
  </p>
  <p style="font-size:20px;">
    📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> | 
    ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
  </p>
</div>
""", unsafe_allow_html=True)

# --- Load data ---
@st.cache_data
def load_data(path="models.json"):
    """Load model scores and derive the average score and per-column ranks.

    Args:
        path: JSON Lines file (one JSON record per line). Each record must
            provide the score columns ``T1`` .. ``T11``.

    Returns:
        The loaded DataFrame with these columns added:
        ``Avg`` — row mean of ``T1`` .. ``T11`` rounded to one decimal, and
        ``<col>_rank`` — integer rank for each of ``T1`` .. ``T11`` and
        ``Avg``, where rank 1 is the highest value and tied values share
        the smallest rank of their group.
    """
    task_cols = [f"T{i}" for i in range(1, 12)]

    df = pd.read_json(path, lines=True)
    df["Avg"] = df[task_cols].mean(axis=1).round(1)

    # Compute rank per column.
    for col in task_cols + ["Avg"]:
        # na_option="bottom" ranks missing scores last. With the default
        # ("keep") a missing score yields a NaN rank, which astype(int)
        # cannot convert and which would crash the whole page.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df

# Scores with the derived "Avg" and "*_rank" columns, read from the default path.
df = load_data()

# --- Tabs ---
# First tab holds the score table, second tab the benchmark description.
tab_labels = ["Leaderboard", "Benchmark Details"]
tab1, tab2 = st.tabs(tab_labels)

with tab1:
    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")

    # Table columns: model name first, then the 11 task scores and their mean.
    score_cols = [f"T{i}" for i in range(1, 12)] + ["Avg"]
    cols = ["Model"] + score_cols
    # Largest (worst) rank per score column, used to scale ranks into [0, 1].
    max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

    # Build the HTML table as a list of fragments and join once at the end
    # instead of growing a string inside nested loops.
    parts = ["<table style='border-collapse:collapse; width:100%;'>"]
    # header
    parts.append(
        "<tr>" + "".join(f"<th style='padding:4px;'>{c}</th>" for c in cols) + "</tr>"
    )
    # rows
    for _, row in df.iterrows():
        parts.append("<tr>")
        parts.append(f"<td style='padding:4px;text-align:left;'>{row['Model']}</td>")
        for c in score_cols:
            rank = row[f"{c}_rank"]
            # 1.0 for rank 1 down to 0.0 for the worst rank; `or 1` avoids a
            # division by zero when every model is tied at rank 1.
            norm = 1 - (rank - 1) / (max_ranks[c] - 1 or 1)
            # Interpolate from white (worst) to rgb(182,243,255) (best).
            # NOTE(review): the blue channel is fixed at 255, so the best
            # cells come out light blue while the caption above says
            # "green" — confirm which one was intended.
            r = int(255 - norm * (255 - 182))
            g = int(255 - norm * (255 - 243))
            b = 255
            style = f"background-color:rgb({r},{g},{b}); padding:4px;"
            # Only the top-ranked value(s) in each column are bolded.
            bold = "font-weight:bold;" if rank == 1 else ""
            parts.append(f"<td style='{style}{bold}'>{row[c]}</td>")
        parts.append("</tr>")
    parts.append("</table>")

    html = "".join(parts)
    st.markdown(html, unsafe_allow_html=True)

with tab2:
    # Re-encode the chart as PNG in memory and embed it as a base64 data URI
    # so it can be centered and sized with inline HTML.
    buf2 = BytesIO()
    chart.save(buf2, format="PNG")
    chart_b64 = base64.b64encode(buf2.getvalue()).decode("utf-8")
    chart_html = f"""
        <div style="text-align:center;">
          <img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
        </div>
    """
    st.markdown(chart_html, unsafe_allow_html=True)

    # (heading, body) pairs rendered in order below the chart.
    sections = (
        ("### What is VERIFACT?", "VERIFACT is a factuality evaluation framework..."),
        ("### What is FACTRBENCH?", "FACTRBENCH is the first benchmark for long-form factuality evaluation..."),
        ("### Key Findings", "VERIFACT outperforms prior methods [...]"),
    )
    for heading, text in sections:
        st.markdown(heading)
        st.write(text)