File size: 3,348 Bytes
7f6ca6e eaa1d85 7f6ca6e 4d2570c 7f6ca6e eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 7f6ca6e f0dd2c2 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e a4c2338 7f6ca6e eaa1d85 7f6ca6e a4c2338 7f6ca6e a4c2338 7f6ca6e eaa1d85 7f6ca6e a4c2338 eaa1d85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import streamlit as st
import pandas as pd
# Page-wide CSS overrides: heading size, table font, and table cell width.
# Injected as raw HTML, which is why unsafe_allow_html must be enabled.
_PAGE_CSS = """
<style>
h1 {
font-size: 2.5em; /* 标题字体大小 */
}
.stDataFrame {
font-family: Helvetica;
}
.dataframe th, .dataframe td {
width: auto;
min-width: 500px;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Page title.
st.title('🏆AEOLLM Leaderboard')
# Introductory description.
# The blank lines around the bullet list are deliberate: without one after the
# last bullet, a CommonMark renderer treats the "Details ..." line as a lazy
# continuation of that list item instead of a separate paragraph.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:

- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""", unsafe_allow_html=True)
# Sample leaderboard data: one result dict and one display table per task.


def _build_score_table(results):
    """Turn one task's result dict into a display-ready DataFrame.

    Args:
        results: Mapping of column name to a list of per-method values.

    Returns:
        A DataFrame built from ``results`` in which every float64/int64
        column has been converted to strings with exactly four decimal
        places; all other columns are left unchanged.
    """
    table = pd.DataFrame(results)
    # Render scores as fixed-width strings so trailing zeros (e.g. 0.7000)
    # are kept when the table is displayed.
    for name in table.select_dtypes(include=['float64', 'int64']).columns:
        table[name] = table[name].map("{:.4f}".format)
    return table


# Dialogue Generation results.
DG = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
    "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
    "spearman": [0.3505, 0.1857, 0.3264, 0.4512],
}
df1 = _build_score_table(DG)

# Text Expansion results.
TE = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
    "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
    "spearman": [0.1352, 0.0667, 0.2867, 0.4157],
}
df2 = _build_score_table(TE)

# Summary Generation results.
SG = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
    "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
    "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
}
df3 = _build_score_table(SG)

# Non-Factoid QA results.
NFQA = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
    "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
    "spearman": [0.2443, 0.2492, 0.4630, 0.4511],
}
df4 = _build_score_table(NFQA)
# Create one tab per task.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
# Fill each tab with a caption line (task and dataset) followed by that
# task's score table, stretched to the container width.
for tab, caption, table in (
    (tab1, """Task: Dialogue Generation; Dataset: DailyDialog""", df1),
    (tab2, """Task: Text Expansion; Dataset: WritingPrompts""", df2),
    (tab3, """Task: Summary Generation; Dataset: Xsum""", df3),
    (tab4, """Task: Non-Factoid QA; Dataset: NF_CATS""", df4),
):
    with tab:
        st.markdown(caption, unsafe_allow_html=True)
        st.dataframe(table, use_container_width=True)
|