Spaces:
Running
Running
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +3 -3
src/streamlit_app.py
CHANGED
@@ -63,7 +63,7 @@ def load_data(path):
|
|
63 |
|
64 |
|
65 |
# one page description
|
66 |
-
st.markdown("## Leaderboard")
|
67 |
# st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
|
68 |
|
69 |
tiers = ['F1', 'Accuracy']
|
@@ -142,7 +142,7 @@ pipeline_image = Image.open("src/pipeline.png")
|
|
142 |
buffered2 = BytesIO()
|
143 |
pipeline_image.save(buffered2, format="PNG")
|
144 |
img_data_pipeline = base64.b64encode(buffered2.getvalue()).decode("utf-8")
|
145 |
-
st.markdown("## Abstract")
|
146 |
st.write(
|
147 |
"""
|
148 |
The paper introduces ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
|
@@ -159,7 +159,7 @@ We benchmark 11 large language models (LLMs) and analyze components in CLEAR, sh
|
|
159 |
)
|
160 |
|
161 |
|
162 |
-
st.markdown("## Pipeline")
|
163 |
st.markdown(
|
164 |
f"""
|
165 |
<div class="logo-container" style="display:flex; justify-content: center;">
|
|
|
63 |
|
64 |
|
65 |
# one page description
|
66 |
+
st.markdown("## 🏆 Leaderboard")
|
67 |
# st.markdown("**Leaderboard:** higher scores shaded green; best models bolded.")
|
68 |
|
69 |
tiers = ['F1', 'Accuracy']
|
|
|
142 |
buffered2 = BytesIO()
|
143 |
pipeline_image.save(buffered2, format="PNG")
|
144 |
img_data_pipeline = base64.b64encode(buffered2.getvalue()).decode("utf-8")
|
145 |
+
st.markdown("## 🧠 Abstract")
|
146 |
st.write(
|
147 |
"""
|
148 |
The paper introduces ExpertLongBench, an expert-level benchmark containing 11 tasks from 9 domains that reflect realistic expert workflows and applications.
|
|
|
159 |
)
|
160 |
|
161 |
|
162 |
+
st.markdown("## 🧰 Evaluation Pipeline")
|
163 |
st.markdown(
|
164 |
f"""
|
165 |
<div class="logo-container" style="display:flex; justify-content: center;">
|