Spaces:

llm-council
/

llm-council

Running

File size: 22,038 Bytes

169dd3c
a056e0b
7ee6d4e
 
 
29e2769
bd4620c
169dd3c
a056e0b
 
 
 
 
 
 
68bf69f
 
 
 
 
 
 
 
 
 
 
a056e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ee6d4e
 
 
 
 
 
 
 
68bf69f
0e92fc0
68bf69f
 
 
 
 
 
 
 
a146b18
68bf69f
 
29e2769
a056e0b
68bf69f
 
 
e10e00e
68bf69f
bfcc00c
68bf69f
 
bd4620c
68bf69f
 
bd4620c
68bf69f
 
bd4620c
 
68bf69f
 
 
 
 
 
 
bfcc00c
 
68bf69f
 
bfcc00c
0e92fc0
 
 
 
 
 
 
 
 
 
 
 
68bf69f
29e2769
68bf69f
 
 
56dd9ac
68bf69f
 
56dd9ac
68bf69f
56dd9ac
29e2769
68bf69f
56dd9ac
29e2769
68bf69f
 
 
 
 
 
 
29e2769
68bf69f
 
 
 
 
 
 
29e2769
68bf69f
 
 
 
 
 
 
29e2769
56dd9ac
 
 
 
 
 
 
29e2769
68bf69f
 
 
29e2769
68bf69f
 
 
29e2769
68bf69f
 
56dd9ac
68bf69f
 
 
29e2769
 
68bf69f
 
 
 
b52aa9e
68bf69f
 
 
7ee6d4e
a056e0b
68bf69f
 
 
 
a056e0b
68bf69f
 
 
bfcc00c
68bf69f
 
 
 
 
7ee6d4e
a056e0b
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfcc00c
68bf69f
bfcc00c
68bf69f
a056e0b
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b30a2c5
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29e2769
68bf69f
 
29e2769
 
a056e0b
 
68bf69f
 
 
 
 
 
 
a056e0b
 
68bf69f
bfcc00c
 
68bf69f
 
a129336
bfcc00c
 
 
68bf69f
 
 
 
 
 
 
 
 
a056e0b
68bf69f
a056e0b
68bf69f
7ee6d4e
68bf69f
a056e0b
 
68bf69f
 
 
 
 
 
 
29e2769
a056e0b
68bf69f
 
 
 
 
 
a056e0b
68bf69f
bfcc00c
 
68bf69f
 
 
bfcc00c
 
 
a056e0b
 
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfcc00c
 
68bf69f
 
 
bfcc00c
 
 
a129336
68bf69f
e10e00e
b52aa9e
68bf69f
 
a129336
68bf69f
 
 
 
 
 
 
 
 
 
a129336
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e10e00e
68bf69f
 
 
a129336
68bf69f
 
 
 
 
 
 
 
a129336
68bf69f
 
 
 
 
 
 
 
a129336
68bf69f
 
 
 
 
 
b52aa9e
68bf69f
 
 
 
b52aa9e
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56dd9ac
68bf69f
56dd9ac
 
 
 
 
 
 
68bf69f
 
bd4620c
68bf69f
 
 
 
 
 
 
 
 
 
 
 
 
a056e0b
3a053a2
68bf69f

import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import random
import plotly.graph_objects as go

# Define constants
# Pairwise-choice labels. `is_consistent` compares the `pairwise_choice`
# values from the judging data against these strings.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Google Analytics (gtag.js) snippet. `main` passes this string to
# `st.markdown(..., unsafe_allow_html=True)`.
GA_TRACKING_CODE = """
<script async src="https://www.googletagmanager.com/gtag/js?id=G-EVZ0R7014L"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-EVZ0R7014L');
</script>
"""


def is_consistent(rating, reverse_rating):
    """Return True when two position-swapped ratings agree with each other.

    `rating` is the pairwise choice for one ordering of the two responses and
    `reverse_rating` is the choice for the same pair with positions flipped.
    Because the positions are swapped, agreement means the labels point in
    opposite directions:

    * an "A wins" label (major or minor) paired with a "B wins" label,
    * a "B wins" label paired with an "A wins" label, or
    * a tie paired with a tie.

    The strength of the preference (major vs. minor) is ignored. Every other
    combination, including values that are not one of the known labels,
    returns False.
    """
    a_wins = {MAJOR_A_WIN, MINOR_A_WIN}
    b_wins = {MAJOR_B_WIN, MINOR_B_WIN}

    if rating in a_wins:
        return reverse_rating in b_wins
    if rating in b_wins:
        return reverse_rating in a_wins
    # Only a tie in both orderings is left as a consistent outcome.
    return (rating, reverse_rating) == (TIE, TIE)


# Function to convert PIL image to base64
def pil_to_base64(img):
    """Encode an image as PNG and return the result as a base64 string.

    `img` must provide a `save(fp, format=...)` method (as PIL images do).
    The image is written to an in-memory buffer in PNG format, and the
    buffer's bytes are returned base64-encoded as text.
    """
    with BytesIO() as png_stream:
        img.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    return base64.b64encode(png_bytes).decode()


def _render_centered_image(path, width=1000):
    """Render the image file at `path` as a centered, inline base64 PNG."""
    # The context manager closes the underlying file once it has been encoded.
    with Image.open(path) as image:
        img_base64 = pil_to_base64(image)
    centered_image_html = f"""
        <div style="text-align: center;">
            <img src="data:image/png;base64,{img_base64}" width="{width}"/>
        </div>
        """
    st.markdown(centered_image_html, unsafe_allow_html=True)


def _find_judging_row(
    df_response_judging, judge, first_model, second_model, emobench_id
):
    """Return the first judging row for a judge and ordered model pair, or None.

    Rows are matched on `llm_judge`, `first_completion_by` and
    `second_completion_by`. None is returned when nothing matches so that
    callers can show a "not found" message instead of hitting an IndexError
    from `.iloc[0]` on an empty frame.
    """
    matches = df_response_judging[
        (df_response_judging["llm_judge"] == judge)
        & (df_response_judging["first_completion_by"] == first_model)
        & (df_response_judging["second_completion_by"] == second_model)
    ]
    # NOTE(review): assumes the judging data identifies the scenario with an
    # `emobench_id` column, as the test-set and responses frames do - confirm.
    # When the column is absent the lookup falls back to the unfiltered match.
    if "emobench_id" in matches.columns:
        matches = matches[matches["emobench_id"] == emobench_id]
    if matches.empty:
        return None
    return matches.iloc[0]


def main():
    """Build and render the Language Model Council Streamlit page.

    Reads the test set, responses, response judging and leaderboard files
    from `data/`, and images from `img/`, then renders:

    * the page header (title, authors, link buttons, hero image, abstract),
    * a "Leaderboard Results" tab with a bar chart and table,
    * a "Browse Data" tab for picking a scenario, a responding model and a
      judge, and viewing the responses and judgments,
    * an "Analysis" tab with pre-rendered figures,
    * an "About Us" tab, and
    * a citation expander.
    """

    # Load your dataframes
    df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
    df_responses = pd.read_json("data/responses.jsonl", lines=True)
    df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
    df_leaderboard = (
        pd.read_csv("data/leaderboard_6_11.csv")
        .sort_values("Rank")
        .reset_index(drop=True)
    )
    df_leaderboard = df_leaderboard.rename(
        columns={"EI Score": "Council Arena EI Score (95% CI)"}
    )

    # Prepare the scenario selector options
    df_test_set["scenario_option"] = (
        df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
    )
    scenario_options = df_test_set["scenario_option"].tolist()

    # Prepare the model selector options
    model_options = df_responses["llm_responder"].unique().tolist()

    # Prepare the judge selector options
    judge_options = df_response_judging["llm_judge"].unique().tolist()

    st.set_page_config(
        page_title="Language Model Council", page_icon="🏛️", layout="wide"
    )

    # Custom CSS to center title and header
    center_css = """
    <style>
    h1, h2, h3, h6{
        text-align: center;
    }
    </style>
    """

    # Add the Google Analytics tracking code to the Streamlit app
    st.markdown(GA_TRACKING_CODE, unsafe_allow_html=True)

    # Remove streamlit's hamburger menu.
    st.markdown(
        """
<style>
.stApp [data-testid="stToolbar"]{
    display:none;
}
</style>
""",
        unsafe_allow_html=True,
    )

    st.markdown(center_css, unsafe_allow_html=True)

    # Title and subtitle.
    st.title("Language Model Council")
    st.markdown(
        "### Democratically Benchmarking Foundation Models on Highly Subjective Tasks :classical_building:"
    )
    st.markdown(
        "###### [Justin Zhao](https://www.justinxzhao.com/)¹, [Flor Miriam Plaza-del-Arco](https://fmplaza.github.io/)², [Benjamin Genchel](https://bgenchel.github.io/)¹, [Amanda Cercas Curry](https://amandacurry.github.io/)³"
    )
    st.markdown("###### ¹ Independent, ² Bocconi University, ³ CENTAI Institute")

    # Three narrow button columns, flanked by two spacer columns for centering.
    _, col1, col2, col3, _ = st.columns([0.3, 0.1, 0.1, 0.1, 0.3])

    with col1:
        st.link_button(
            "Data",
            "https://huggingface.co/datasets/llm-council/emotional_application",
            use_container_width=True,
            type="primary",
        )

    with col2:
        st.link_button(
            "Paper",
            "https://arxiv.org/abs/2406.08598",
            use_container_width=True,
            type="primary",
        )

    with col3:
        st.link_button(
            "Github",
            "https://github.com/llm-council/llm-council",
            use_container_width=True,
            type="primary",
        )

    # with col4:
    #     st.link_button(
    #         "Website",
    #         "https://llm-council.com/",
    #         use_container_width=True,
    #         type="primary",
    #     )

    # Render hero image.
    with open("img/hero.svg", "r") as file:
        svg_content = file.read()

    _, cent_co, _ = st.columns([0.2, 0.6, 0.2])
    with cent_co:
        st.image(svg_content, use_column_width=True)

    with cent_co.expander("Abstract"):
        st.markdown(
            """As Large Language Models (LLMs) continue to evolve, evaluating them remains a persistent challenge. Many recent evaluations use LLMs as judges to score outputs from other LLMs, often relying on a single large model like GPT-4o. However, using a single LLM judge is prone to intra-model bias, and many tasks - such as those related to emotional intelligence, creative writing, and persuasiveness - may be too subjective for a single model to judge fairly. We introduce the Language Model Council (LMC), where a group of LLMs collaborate to create tests, respond to them, and evaluate each other's responses to produce a ranking in a democratic fashion. Unlike previous approaches that focus on reducing cost or bias by using a panel of smaller models, our work examines the benefits and nuances of a fully inclusive LLM evaluation system. In a detailed case study on emotional intelligence, we deploy a council of 20 recent LLMs to rank each other on open-ended responses to interpersonal conflicts. Our results show that the LMC produces rankings that are more separable and more robust, and through a user study, we show that they are more consistent with human evaluations than any individual LLM judge. Using all LLMs for judging can be costly, however, so we use Monte Carlo simulations and hand-curated sub-councils to study hypothetical council compositions and discuss the value of the incremental LLM judge."""
        )
    st.markdown(
        "This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
    )

    # Create horizontal tabs
    tabs = st.tabs(
        [
            "Leaderboard Results",
            "Browse Data",
            "Analysis",
            "About Us",
        ]
    )

    # Define content for each tab
    with tabs[0]:
        _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
        mid_column.markdown("#### Leaderboard Graph")

        # The score column is split on spaces into three pieces: the score,
        # a lower offset wrapped in "(" and ",", and an upper offset ending
        # in ")".
        df = df_leaderboard.copy()
        df["Score"] = df["Council Arena EI Score (95% CI)"].apply(
            lambda x: float(x.split(" ")[0])
        )
        df["Lower"] = df["Council Arena EI Score (95% CI)"].apply(
            lambda x: float(x.split(" ")[1][1:-1])
        )
        df["Upper"] = df["Council Arena EI Score (95% CI)"].apply(
            lambda x: float(x.split(" ")[2][:-1])
        )

        # Sort the DataFrame by Score in descending order
        df = df.sort_values(by="Score", ascending=False)

        # Create the bar chart
        fig = go.Figure()

        # Generate rainbow colors
        num_bars = len(df)
        colors = [f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)]

        fig.add_trace(
            go.Bar(
                x=df["Score"],
                y=df["LLM"],
                orientation="h",
                error_x=dict(
                    type="data",
                    array=df["Upper"],
                    # "Lower" is negated before being used as the minus-side
                    # error bar length.
                    arrayminus=-1 * df["Lower"],
                    thickness=0.5,
                    width=3,
                    color="black",
                ),
                marker=dict(color=colors, opacity=0.8),
            )
        )

        fig.update_layout(
            xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True),
            yaxis_title="LLM",
            yaxis=dict(autorange="reversed"),
            template="presentation",
            width=1000,
            height=700,
        )

        # Display the plot in Streamlit
        mid_column.plotly_chart(fig)

        mid_column.divider()

        mid_column.markdown("#### Leaderboard Table")

        # Display the table.
        mid_column.dataframe(df_leaderboard, hide_index=True)

    # HTML and CSS to create a text box with specified color
    def colored_text_box(text, background_color, text_color="black"):
        html_code = f"""
        <div style="
            background-color: {background_color};
            color: {text_color};
            padding: 10px;
            border-radius: 10px;
            ">
            {text}
        </div>
        """
        return html_code

    # Ensure to initialize session state variables if they do not exist
    if "selected_scenario" not in st.session_state:
        st.session_state.selected_scenario = None

    if "selected_model" not in st.session_state:
        st.session_state.selected_model = None

    if "selected_judge" not in st.session_state:
        st.session_state.selected_judge = None

    # Define callback functions to update session state
    def update_scenario():
        st.session_state.selected_scenario = st.session_state.scenario_selector

    def update_model():
        st.session_state.selected_model = st.session_state.model_selector

    def update_judge():
        st.session_state.selected_judge = st.session_state.judge_selector

    def randomize_selection():
        st.session_state.selected_scenario = random.choice(scenario_options)
        st.session_state.selected_model = random.choice(model_options)
        st.session_state.selected_judge = random.choice(judge_options)

    with tabs[1]:
        # Add randomize button at the top of the app
        _, mid_column, _ = st.columns([0.4, 0.2, 0.4])
        mid_column.button(
            ":game_die: Randomize!",
            on_click=randomize_selection,
            type="primary",
            use_container_width=True,
        )

        st.markdown("#### 1. Select a scenario.")
        # Create the selectors
        st.session_state.selected_scenario = st.selectbox(
            "Select Scenario",
            scenario_options,
            label_visibility="hidden",
            key="scenario_selector",
            on_change=update_scenario,
            index=(
                scenario_options.index(st.session_state.selected_scenario)
                if st.session_state.selected_scenario
                else 0
            ),
        )

        # Get the selected scenario details
        if st.session_state.selected_scenario:
            # Options are formatted "<emobench_id>: <scenario>" above.
            selected_emobench_id = int(
                st.session_state.selected_scenario.split(": ")[0]
            )
            scenario_details = df_test_set[
                df_test_set["emobench_id"] == selected_emobench_id
            ].iloc[0]

            # Display the detailed dilemma and additional information
            st.markdown(
                colored_text_box(
                    scenario_details["detailed_dilemma"],
                    "#01204E",
                    "white",
                ),
                unsafe_allow_html=True,
            )
            with st.expander("Additional Information"):
                st.write(
                    {
                        "LLM Author": scenario_details["llm_author"],
                        "Problem": scenario_details["problem"],
                        "Relationship": scenario_details["relationship"],
                        "Scenario": scenario_details["scenario"],
                    }
                )

        st.divider()

        st.markdown("#### 2. View responses.")

        # Create two columns for model selectors
        col1, col2 = st.columns(2)

        with col1:
            fixed_model = "qwen1.5-32B-Chat"
            st.selectbox(
                "Select Model",
                [fixed_model],
                key="fixed_model",
                label_visibility="hidden",
            )

            # Get the response string for the fixed model
            if st.session_state.selected_scenario:
                response_details_fixed = df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == fixed_model)
                ].iloc[0]

                # Display the response string
                st.markdown(
                    colored_text_box(
                        response_details_fixed["response_string"],
                        "#028391",
                        "white",
                    ),
                    unsafe_allow_html=True,
                )

        with col2:
            st.session_state.selected_model = st.selectbox(
                "Select Model",
                model_options,
                key="model_selector",
                on_change=update_model,
                index=(
                    model_options.index(st.session_state.selected_model)
                    if st.session_state.selected_model
                    else 0
                ),
            )

            # Get the response string for the selected model
            if st.session_state.selected_model and st.session_state.selected_scenario:
                response_details_dynamic = df_responses[
                    (df_responses["emobench_id"] == selected_emobench_id)
                    & (df_responses["llm_responder"] == st.session_state.selected_model)
                ].iloc[0]

                # Display the response string
                st.markdown(
                    colored_text_box(
                        response_details_dynamic["response_string"],
                        "#028391",
                        "white",
                    ),
                    unsafe_allow_html=True,
                )

        st.divider()

        st.markdown("#### 3. Response judging.")
        st.markdown("##### All council members")
        col1, col2 = st.columns(2)

        # These counts cover every judging row for the ordered model pair.
        with col1:
            st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**")
            pairwise_counts_left = df_response_judging[
                (df_response_judging["first_completion_by"] == fixed_model)
                & (
                    df_response_judging["second_completion_by"]
                    == st.session_state.selected_model
                )
            ]["pairwise_choice"].value_counts()
            st.bar_chart(pairwise_counts_left)

        with col2:
            st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**")
            pairwise_counts_right = df_response_judging[
                (
                    df_response_judging["first_completion_by"]
                    == st.session_state.selected_model
                )
                & (df_response_judging["second_completion_by"] == fixed_model)
            ]["pairwise_choice"].value_counts()
            st.bar_chart(pairwise_counts_right)

        # Create the llm_judge selector
        st.markdown("##### Individual LLM judges")
        st.session_state.selected_judge = st.selectbox(
            "Select Judge",
            judge_options,
            label_visibility="hidden",
            key="judge_selector",
            on_change=update_judge,
            index=(
                judge_options.index(st.session_state.selected_judge)
                if st.session_state.selected_judge
                else 0
            ),
        )

        # Get the judging details for the selected judge and models
        if st.session_state.selected_judge and st.session_state.selected_scenario:
            col1, col2 = st.columns(2)

            # Either lookup may be None, e.g. when the selected model is the
            # fixed model and no such pairing exists in the judging data.
            judging_details_left = _find_judging_row(
                df_response_judging,
                st.session_state.selected_judge,
                fixed_model,
                st.session_state.selected_model,
                selected_emobench_id,
            )
            judging_details_right = _find_judging_row(
                df_response_judging,
                st.session_state.selected_judge,
                st.session_state.selected_model,
                fixed_model,
                selected_emobench_id,
            )

            # Render consistency (needs a judgment for both orderings).
            if judging_details_left is not None and judging_details_right is not None:
                if is_consistent(
                    judging_details_left["pairwise_choice"],
                    judging_details_right["pairwise_choice"],
                ):
                    st.success(
                        f"{st.session_state.selected_judge} as a judge was consistent on this example with positions flipped.",
                        icon="✅",
                    )
                else:
                    st.warning(
                        f"{st.session_state.selected_judge} as a judge was inconsistent on this example with positions flipped.",
                        icon="⚠️",
                    )

            # Display the judging details
            with col1:
                if judging_details_left is not None:
                    st.write(
                        f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                    )
                    st.markdown(
                        colored_text_box(
                            judging_details_left["judging_response_string"],
                            "#FEAE6F",
                            "black",
                        ),
                        unsafe_allow_html=True,
                    )
                else:
                    st.write("No judging details found for the selected combination.")

            with col2:
                if judging_details_right is not None:
                    st.write(
                        f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                    )
                    st.markdown(
                        colored_text_box(
                            judging_details_right["judging_response_string"],
                            "#FEAE6F",
                            "black",
                        ),
                        unsafe_allow_html=True,
                    )
                else:
                    st.write("No judging details found for the selected combination.")

    with tabs[2]:
        st.markdown("### Battles (Respondent vs. Respondent)")
        st.markdown("###### Expected win rates based on Bradley-Terry coefficients")
        _render_centered_image("img/llm_vs_llm_win_rates.png")

        st.divider()

        st.markdown("### Affinities (Judge vs. Respondent)")

        st.markdown("###### Raw affinities")
        _render_centered_image("img/raw.png")

        # Some extra space.
        st.text("")
        st.text("")
        st.text("")

        st.markdown("###### Council-Normalized")
        _render_centered_image("img/council_normalized.png")

        st.divider()

        st.markdown("### Agreement (Judge vs. Judge)")

        st.markdown("###### Sidewise Cohen's Kappa:")
        _render_centered_image("img/judge_agreement.sidewise_cohen_kappa.png")

        st.write("Check out the paper for more detailed analysis!")

    with tabs[-1]:
        st.markdown(
            """**Motivation**:

Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often make their own claims about being the best at something, often citing its position on a benchmark or a leaderboard. But what if we let the models themselves decide who's the best?

**Main collaborators**:
- [Justin Zhao](https://x.com/justinxzhao)
- [Flor Plaza](https://x.com/florplaza22)
- [Sam Paech](https://x.com/sam_paech)
- [Federico Bianchi](https://x.com/federicobianchy)
- [Sahand Sabour](https://x.com/SahandSabour)
- [Amanda Cercas Curry](https://x.com/CurriedAmanda)
        """
        )

    # st.markdown("#### Citation")
    with st.expander("Citation"):
        st.write(
            "Please cite the following paper if you find our leaderboard, dataset, or framework helpful."
        )
        # BibTeX fields must be comma-separated, including after `Year`.
        st.code(
            """@misc{zhao2024council,
        Title = {Language Model Council: Benchmarking Foundation Models on Highly Subjective Tasks by Consensus},
        Author = {Justin Zhao and Flor Miriam Plaza-del-Arco and Amanda Cercas Curry},
        Year = {2024},
        Eprint = {arXiv:2406.08598},
    }"""
        )


# Render the app only when this file is run as the main script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()