File size: 3,297 Bytes
7f6ca6e eaa1d85 7f6ca6e 4d2570c 7f6ca6e eaa1d85 d6304fe eaa1d85 d6304fe eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 d6304fe eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 d6304fe eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 d6304fe eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 7f6ca6e a4c2338 7f6ca6e a4c2338 7f6ca6e a4c2338 7f6ca6e a4c2338 eaa1d85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import streamlit as st
import pandas as pd
# Custom CSS for the page: title size, dataframe font, and table cell width.
_PAGE_STYLE = """
<style>
h1 {
font-size: 2.5em; /* 标题字体大小 */
}
.stDataFrame {
font-family: Helvetica;
}
.dataframe th, .dataframe td {
width: auto;
min-width: 500px;
}
</style>
"""
# The body is a raw <style> element, so HTML rendering has to be enabled
# explicitly for this call.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
# Page title.
st.title('🏆AEOLLM Leaderboard')
# Introductory description of the leaderboard and the four tasks it covers.
# The text is plain Markdown (no inline HTML), so it is rendered with the
# default, HTML-escaping mode. The blank line after the bullet list keeps the
# closing sentence as its own paragraph instead of a continuation of the last
# list item.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""")
# Build the example leaderboard data, one table per task.
# teamId is the unique identifier of a submission.
def _format_scores(table: dict) -> pd.DataFrame:
    """Return ``table`` as a DataFrame with numeric columns shown to 4 decimals.

    Every float64/int64 column is replaced by its fixed-point string form
    (e.g. ``0.7`` -> ``"0.7000"``) so all scores display with the same number
    of digits. Non-numeric columns are left untouched, and the input mapping
    itself is not modified.
    """
    frame = pd.DataFrame(table)
    numeric_columns = frame.select_dtypes(include=['float64', 'int64']).columns
    for column in numeric_columns:
        frame[column] = frame[column].map("{:.4f}".format)
    return frame


# Dialogue Generation
DG = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
    "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
    "spearman": [0.3505, 0.1857, 0.3264, 0.4512],
}
df1 = _format_scores(DG)

# Text Expansion
TE = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
    "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
    "spearman": [0.1352, 0.0667, 0.2867, 0.4157],
}
df2 = _format_scores(TE)

# Summary Generation
SG = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
    "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
    "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
}
df3 = _format_scores(SG)

# Non-Factoid QA
NFQA = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
    "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
    "spearman": [0.2443, 0.2492, 0.4630, 0.4511],
}
df4 = _format_scores(NFQA)
# Create one tab per task and render that task's caption and results table.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
# (tab, caption, table) triples; the captions are plain text, so they are
# rendered with the default HTML-escaping Markdown mode.
for tab, caption, frame in (
    (tab1, "Task: Dialogue Generation; Dataset: DailyDialog", df1),
    (tab2, "Task: Text Expansion; Dataset: WritingPrompts", df2),
    (tab3, "Task: Summary Generation; Dataset: Xsum", df3),
    (tab4, "Task: Non-Factoid QA; Dataset: NF_CATS", df4),
):
    with tab:
        st.markdown(caption)
        st.dataframe(frame, use_container_width=True)
|