Spaces:

launch
/

ExpertLongBench

Running

App Files Files Community

shezamunir committed on May 15

Commit

35c36b4

verified ·

1 Parent(s): 72b06da

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +101 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,103 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import pandas as pd
+import numpy as np
+from PIL import Image
+import base64
+from io import BytesIO
+# --- Page config: browser tab title and full-width ("wide") layout ---
+st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")
+# --- Load images ---
+@st.cache_data
+def load_image(path):
+    return Image.open(path)
+logo = load_image("factrbench.png")
+chart = load_image("test.png")
+# Display logo
+buf = BytesIO()
+logo.save(buf, format="PNG")
+logo_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+st.markdown(f"""
+    <div style="text-align:center; margin-bottom:20px;">
+        <img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
+    </div>
+""", unsafe_allow_html=True)
+# Header
+st.markdown("""
+<div style="text-align:center;">
+  <p style="font-size:22px;">
+    VERIFACT: Enhancing Long-Form Factuality Evaluation...
+  </p>
+  <p style="font-size:20px;">
+    # 📑 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> |
+    ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
+  </p>
+</div>
+""", unsafe_allow_html=True)
+# --- Load data ---
+@st.cache_data
+def load_data(path="models.json"):
+    df = pd.read_json(path, lines=True)
+    df["Avg"] = df[[f"T{i}" for i in range(1,12)]].mean(axis=1).round(1)
+    # Compute rank per column
+    for col in [f"T{i}" for i in range(1,12)] + ["Avg"]:
+        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
+    return df
+df = load_data()
+# --- Tabs ---
+tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])
+with tab1:
+    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")
+    # Build HTML table
+    cols = ["Model"] + [f"T{i}" for i in range(1,12)] + ["Avg"]
+    max_ranks = {col: df[f"{col}_rank"].max() for col in cols if col!="Model"}
+    html = "<table style='border-collapse:collapse; width:100%;'>"
+    # header
+    html += "<tr>" + "".join(f"<th style='padding:4px;'>{c}</th>" for c in cols) + "</tr>"
+    # rows
+    for _, row in df.iterrows():
+        html += "<tr>"
+        for c in cols:
+            val = row[c] if c!="Model" else row[c]
+            if c=="Model":
+                html += f"<td style='padding:4px;text-align:left;'>{val}</td>"
+            else:
+                # color gradient
+                rank = row[f"{c}_rank"]
+                norm = 1 - (rank-1)/(max_ranks[c]-1 or 1)
+                # interpolate green-white
+                r = int(255 - norm*(255-182))
+                g = int(255 - norm*(255-243))
+                b = 255
+                style = f"background-color:rgb({r},{g},{b}); padding:4px;"
+                bold = "font-weight:bold;" if rank==1 else ""
+                html += f"<td style='{style}{bold}'>{val}</td>"
+        html += "</tr>"
+    html += "</table>"
+    st.markdown(html, unsafe_allow_html=True)
+with tab2:
+    buf2 = BytesIO()
+    chart.save(buf2, format="PNG")
+    chart_b64 = base64.b64encode(buf2.getvalue()).decode("utf-8")
+    st.markdown(f"""
+        <div style="text-align:center;">
+          <img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
+        </div>
+    """, unsafe_allow_html=True)
+    st.markdown("### What is VERIFACT?")
+    st.write("VERIFACT is a factuality evaluation framework...")
+    st.markdown("### What is FACTRBENCH?")
+    st.write("FACTRBENCH is the first benchmark for long-form factuality evaluation...")
+    st.markdown("### Key Findings")
+    st.write("VERIFACT outperforms prior methods [...]")