Spaces:

THUIR
/

AEOLLM

Running

File size: 3,297 Bytes

7f6ca6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eaa1d85
 
7f6ca6e
 
 
4d2570c
7f6ca6e
 
eaa1d85
d6304fe
eaa1d85
d6304fe
eaa1d85
 
 
 
7f6ca6e
f0dd2c2
 
 
 
7f6ca6e
eaa1d85
d6304fe
eaa1d85
 
 
 
7f6ca6e
f0dd2c2
 
 
7f6ca6e
eaa1d85
d6304fe
eaa1d85
 
 
 
7f6ca6e
f0dd2c2
 
 
7f6ca6e
eaa1d85
d6304fe
eaa1d85
 
 
 
7f6ca6e
f0dd2c2
 
 
7f6ca6e
 
eaa1d85
7f6ca6e
 
a4c2338
7f6ca6e
 
 
a4c2338
7f6ca6e
 
 
a4c2338
7f6ca6e
 
 
a4c2338
eaa1d85

import streamlit as st
import pandas as pd

# Page-wide CSS overrides: heading size, table font and minimum cell width.
_PAGE_STYLE = """
<style>
h1 {
    font-size: 2.5em;  /* 标题字体大小 */
}
.stDataFrame {
    font-family: Helvetica;
}
.dataframe th, .dataframe td {
    width: auto;
    min-width: 500px; 
}
</style>
"""

# The payload is a raw <style> element, so HTML rendering is explicitly enabled.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)

# Page title
st.title('🏆AEOLLM Leaderboard')

# Introductory description: what the leaderboard reports, the four tasks it
# covers, and a link to the project page. The text is plain Markdown (no HTML
# tags), so raw-HTML rendering is not requested.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)
            
Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""")
# Build the sample (baseline) data for each task.


def _build_table(scores):
    """Return a DataFrame for *scores* with numeric columns rendered as text.

    Every float64/int64 column is replaced by its four-decimal string form
    (e.g. 0.505 -> "0.5050") so all metrics display with the same precision.
    Non-numeric columns and the input dict are left untouched.
    """
    table = pd.DataFrame(scores)
    numeric_cols = table.select_dtypes(include=['float64', 'int64']).columns
    for name in numeric_cols:
        table[name] = table[name].map("{:.4f}".format)
    return table


# teamId is the unique identifier of a submission.
# Dialogue Generation
DG = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
    "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
    "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
}
df1 = _build_table(DG)

# Text Expansion
TE = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
    "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
    "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
}
df2 = _build_table(TE)

# Summary Generation
SG = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
    "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
    "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
}
df3 = _build_table(SG)

# Non-Factoid QA
NFQA = {
    "teamId": ["baseline1", "baseline2", "baseline3", "baseline4"],
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
    "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
    "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
}
df4 = _build_table(NFQA)

# Create one tab per task.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

# Each tab shows a one-line caption (task and dataset) above its score table.
# Captions are plain text, so raw-HTML rendering is not requested.
for tab, caption, frame in (
    (tab1, "Task: Dialogue Generation; Dataset: DailyDialog", df1),
    (tab2, "Task: Text Expansion; Dataset: WritingPrompts", df2),
    (tab3, "Task: Summary Generation; Dataset: Xsum", df3),
    (tab4, "Task: Non-Factoid QA; Dataset: NF_CATS", df4),
):
    with tab:
        st.markdown(caption)
        st.dataframe(frame, use_container_width=True)