Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +101 -38
src/streamlit_app.py
CHANGED
@@ -1,40 +1,103 @@
|
|
1 |
-
import altair as alt
|
2 |
-
import numpy as np
|
3 |
-
import pandas as pd
|
4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
"""
|
7 |
-
# Welcome to Streamlit!
|
8 |
-
|
9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
11 |
-
forums](https://discuss.streamlit.io).
|
12 |
-
|
13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
14 |
-
"""
|
15 |
-
|
16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
18 |
-
|
19 |
-
indices = np.linspace(0, 1, num_points)
|
20 |
-
theta = 2 * np.pi * num_turns * indices
|
21 |
-
radius = indices
|
22 |
-
|
23 |
-
x = radius * np.cos(theta)
|
24 |
-
y = radius * np.sin(theta)
|
25 |
-
|
26 |
-
df = pd.DataFrame({
|
27 |
-
"x": x,
|
28 |
-
"y": y,
|
29 |
-
"idx": indices,
|
30 |
-
"rand": np.random.randn(num_points),
|
31 |
-
})
|
32 |
-
|
33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
34 |
-
.mark_point(filled=True)
|
35 |
-
.encode(
|
36 |
-
x=alt.X("x", axis=None),
|
37 |
-
y=alt.Y("y", axis=None),
|
38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
40 |
-
))
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from PIL import Image
|
5 |
+
import base64
|
6 |
+
from io import BytesIO
|
7 |
+
|
8 |
+
# --- Page config ---
|
9 |
+
st.set_page_config(page_title="VeriFact Leaderboard", layout="wide")
|
10 |
+
|
11 |
+
# --- Load images ---
|
12 |
+
@st.cache_data
|
13 |
+
def load_image(path):
    """Open the image file at ``path`` and return it fully decoded.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.

    Returns:
        The ``PIL.Image.Image`` object with its pixel data already loaded.
    """
    # Image.open() only reads the header and keeps the file open until the
    # pixels are needed. Loading inside the context manager decodes the data
    # now and guarantees the underlying file handle is closed on exit, while
    # the returned image stays usable (e.g. for the later ``.save`` calls).
    with Image.open(path) as img:
        img.load()
    return img
|
15 |
+
|
16 |
+
# Both images are loaded up front; `chart` is only used further down.
logo = load_image("factrbench.png")
chart = load_image("test.png")

# Display logo: re-encode it as PNG in memory and inline it as a base64
# data URI so it can be centred and sized with raw HTML.
buf = BytesIO()
logo.save(buf, format="PNG")
logo_png = buf.getvalue()
logo_b64 = base64.b64encode(logo_png).decode("utf-8")
logo_html = f"""
<div style="text-align:center; margin-bottom:20px;">
<img src="data:image/png;base64,{logo_b64}" style="width:50%; max-width:700px;"/>
</div>
"""
st.markdown(logo_html, unsafe_allow_html=True)
|
28 |
+
|
29 |
+
# Header
|
30 |
+
# Title line plus the links / version line, rendered as centred raw HTML.
# NOTE(review): the emoji in this string had been mis-decoded (UTF-8 bytes read
# as a single-byte Greek code page). The GitHub and HuggingFace glyphs are
# recoverable from the surviving bytes; the Paper and Version glyphs are best
# guesses -- confirm against the intended design.
# NOTE(review): all three href attributes are still empty placeholders.
st.markdown("""
<div style="text-align:center;">
<p style="font-size:22px;">
VERIFACT: Enhancing Long-Form Factuality Evaluation...
</p>
<p style="font-size:20px;">
# 📄 <a href="">Paper</a> | 💻 <a href="">GitHub</a> | 🤗 <a href="">HuggingFace</a> |
⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 11 | Updated: <strong>April 2025</strong>
</p>
</div>
""", unsafe_allow_html=True)
|
41 |
+
|
42 |
+
# --- Load data ---
|
43 |
+
@st.cache_data
|
44 |
+
def load_data(path="models.json"):
    """Load the leaderboard scores and derive the average and per-column ranks.

    Args:
        path: JSON Lines file (one record per line). Each record must provide
            the score columns ``T1`` .. ``T11``.

    Returns:
        A DataFrame holding the file's columns plus:
          * ``Avg`` -- row-wise mean of ``T1`` .. ``T11``, rounded to one decimal.
          * ``<col>_rank`` -- integer rank for every score column and for
            ``Avg``; rank 1 is the highest score and tied scores share the
            smallest rank of their group.

    Raises:
        KeyError: if any of the ``T1`` .. ``T11`` columns is absent.
    """
    score_cols = [f"T{i}" for i in range(1, 12)]

    df = pd.read_json(path, lines=True)
    df["Avg"] = df[score_cols].mean(axis=1).round(1)

    # Compute rank per column.
    for col in [*score_cols, "Avg"]:
        # na_option="bottom" places missing scores last. With the default
        # ("keep") a missing score produces a NaN rank and the integer cast
        # below raises.
        ranks = df[col].rank(ascending=False, method="min", na_option="bottom")
        df[f"{col}_rank"] = ranks.astype(int)
    return df
|
51 |
+
|
52 |
+
df = load_data()

# --- Tabs ---
tab1, tab2 = st.tabs(["Leaderboard", "Benchmark Details"])

with tab1:
    st.markdown("**Leaderboard:** Higher scores shaded green; best models bolded.")

    # Build the leaderboard as a hand-written HTML table so each score cell
    # can carry its own background colour and weight.
    cols = ["Model"] + [f"T{i}" for i in range(1, 12)] + ["Avg"]
    score_cols = cols[1:]
    # Worst (largest) rank per score column; used to scale ranks into [0, 1].
    max_ranks = {col: df[f"{col}_rank"].max() for col in score_cols}

    # Collect fragments and join once instead of growing a string with `+=`.
    parts = ["<table style='border-collapse:collapse; width:100%;'>"]

    # header
    parts.append(
        "<tr>" + "".join(f"<th style='padding:4px;'>{c}</th>" for c in cols) + "</tr>"
    )

    # rows
    # NOTE(review): cell values are interpolated into the markup unescaped;
    # confirm the data file never contains stray '<' or '&'.
    for _, row in df.iterrows():
        parts.append("<tr>")
        parts.append(f"<td style='padding:4px;text-align:left;'>{row['Model']}</td>")
        for c in score_cols:
            rank = row[f"{c}_rank"]
            # 1.0 for the best rank, 0.0 for the worst; `or 1` avoids a
            # division by zero when every row shares rank 1.
            norm = 1 - (rank - 1) / (max_ranks[c] - 1 or 1)
            # Interpolate from white (norm 0) towards rgb(182,243,255) (norm 1).
            # NOTE(review): the blue channel stays at 255, so the strongest
            # shade looks light blue rather than the "green" promised in the
            # caption above -- confirm which one is intended.
            r = int(255 - norm * (255 - 182))
            g = int(255 - norm * (255 - 243))
            b = 255
            style = f"background-color:rgb({r},{g},{b}); padding:4px;"
            bold = "font-weight:bold;" if rank == 1 else ""
            parts.append(f"<td style='{style}{bold}'>{row[c]}</td>")
        parts.append("</tr>")
    parts.append("</table>")

    html = "".join(parts)
    st.markdown(html, unsafe_allow_html=True)
|
87 |
+
|
88 |
+
with tab2:
    # Inline the chart as a base64 PNG data URI so it can be centred and
    # scaled with raw HTML.
    buf2 = BytesIO()
    chart.save(buf2, format="PNG")
    chart_png = buf2.getvalue()
    chart_b64 = base64.b64encode(chart_png).decode("utf-8")
    chart_html = f"""
<div style="text-align:center;">
<img src="data:image/png;base64,{chart_b64}" style="width:65%;"/>
</div>
"""
    st.markdown(chart_html, unsafe_allow_html=True)

    # Each section is a markdown heading followed by one paragraph of text.
    sections = (
        ("### What is VERIFACT?",
         "VERIFACT is a factuality evaluation framework..."),
        ("### What is FACTRBENCH?",
         "FACTRBENCH is the first benchmark for long-form factuality evaluation..."),
        ("### Key Findings",
         "VERIFACT outperforms prior methods [...]"),
    )
    for heading, body in sections:
        st.markdown(heading)
        st.write(body)
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|