justinxzhao
committed on
Commit
•
a056e0b
1
Parent(s):
312e7a9
Initial version of data tab browser.
Browse files- .gitignore +2 -1
- app.py +247 -2
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
env/
|
|
|
|
1 |
+
env/
|
2 |
+
.DS_Store
|
app.py
CHANGED
@@ -1,4 +1,249 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
|
4 |
+
# Define constants for the five pairwise judging outcomes.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Ratings that favor A / favor B, regardless of margin.
_A_WINS = {MAJOR_A_WIN, MINOR_A_WIN}
_B_WINS = {MAJOR_B_WIN, MINOR_B_WIN}


def is_consistent(rating, reverse_rating):
    """Return True if a rating and its position-swapped counterpart agree.

    ``rating`` judges the pair (A, B) and ``reverse_rating`` judges the same
    pair with positions swapped, so a consistent judge flips direction: an
    A-favoring outcome in one order must be B-favoring in the other, and a
    tie must remain a tie.

    Args:
        rating: Pairwise choice for the original ordering (e.g. "A>>B").
        reverse_rating: Pairwise choice for the swapped ordering.

    Returns:
        True when the two ratings are mutually consistent; False otherwise,
        including for unrecognized rating strings.
    """
    # The original implementation had four directional branches, but branches
    # 3-4 were exact mirrors of 1-2 (same membership tests with the operands
    # of `and` swapped), so two checks cover every directional case.
    if rating in _A_WINS and reverse_rating in _B_WINS:
        return True
    if rating in _B_WINS and reverse_rating in _A_WINS:
        return True
    # A tie is only consistent with a tie; everything else (tie vs. win,
    # same-direction wins, unknown strings) is inconsistent.
    return rating == TIE and reverse_rating == TIE
|
40 |
+
|
41 |
+
|
42 |
+
# Load the evaluation artifacts (JSONL, one record per line).
# NOTE(review): paths are relative to the app's working directory — the app
# must be launched from the repo root for these reads to succeed.
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)

# Build human-readable scenario selector labels: "<emobench_id>: <scenario>".
# The id prefix is parsed back out later with split(": "), so the separator
# here and there must stay in sync.
df_test_set["scenario_option"] = (
    df_test_set["emobench_id"].astype(str) + ": " + df_test_set["scenario"]
)
scenario_options = df_test_set["scenario_option"].tolist()

# Distinct responder models present in the responses data.
model_options = df_responses["llm_responder"].unique().tolist()

# Distinct judge models present in the judging data.
judge_options = df_response_judging["llm_judge"].unique().tolist()
|
58 |
+
|
59 |
+
# Page-level Streamlit configuration; must run before any other st.* call.
st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

# Three equal-width columns for the navigation-style buttons.
col1, col2, col3 = st.columns(3)

# Injected CSS so each button stretches to fill its column.
full_width_button_css = """
<style>
div.stButton > button {
    width: 100%;
}
</style>
"""

st.markdown(full_width_button_css, unsafe_allow_html=True)

# One button per column; clicking shows its placeholder message.
_nav_items = (
    ("Blog", "Button 1 clicked"),
    ("Paper", "Button 2 clicked"),
    ("Github", "Button 3 clicked"),
)
for _col, (_label, _message) in zip((col1, col2, col3), _nav_items):
    with _col:
        if st.button(_label):
            st.write(_message)
|
87 |
+
|
88 |
+
# Injected CSS that centers every heading level (title, subheader, etc.).
center_styles = """
<style>
h1, h2, h3, h4, h5, h6 {
    text-align: center;
}
</style>
"""
st.markdown(center_styles, unsafe_allow_html=True)

# Page heading.
st.title("Language Model Council")
st.subheader("Applied to emotional intelligence")
|
101 |
+
|
102 |
+
# Horizontal tab bar for the three main views. `tabs` is indexed by the
# tab-content sections that follow.
tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

# Tab 0: leaderboard view.
with tabs[0]:
    st.write("This is the leaderboard results page.")
    # Placeholder rows until real leaderboard results are wired in.
    placeholder_scores = {"Name": ["Alice", "Bob", "Charlie"], "Score": [95, 85, 75]}
    st.table(placeholder_scores)
|
111 |
+
|
112 |
+
# Tab 1: browse one scenario, two model responses side by side, and the
# pairwise judgments (in both orderings) from a chosen judge.
with tabs[1]:
    # --- Scenario picker --------------------------------------------------
    selected_scenario = st.selectbox("Select Scenario", scenario_options)

    if selected_scenario:
        # Labels are "<emobench_id>: <scenario>", so the id precedes ": ".
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        # Show the full dilemma text plus its metadata.
        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")

    st.divider()

    # --- Side-by-side model responses ------------------------------------
    col1, col2 = st.columns(2)

    with col1:
        # Left column is pinned to a fixed reference model.
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox("Select Model", [fixed_model], key="fixed_model")

        if selected_scenario:
            # Bug fix: filter first and check emptiness before .iloc[0];
            # the original called .iloc[0] unconditionally, which raises
            # IndexError when no response row matches.
            fixed_rows = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == fixed_model)
            ]
            if fixed_rows.empty:
                st.write("No response found for the selected combination.")
            else:
                st.write(fixed_rows.iloc[0]["response_string"])

    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )

        if selected_model and selected_scenario:
            dynamic_rows = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == selected_model)
            ]
            if dynamic_rows.empty:
                st.write("No response found for the selected combination.")
            else:
                st.write(dynamic_rows.iloc[0]["response_string"])

    st.divider()

    # --- Pairwise judging (both orderings) --------------------------------
    selected_judge = st.selectbox("Select Judge", judge_options)

    if selected_judge and selected_scenario:
        col1, col2 = st.columns(2)

        # NOTE(review): these filters do not constrain emobench_id, which
        # implicitly assumes one judging row per (judge, ordered pair) —
        # confirm against the judging data's schema.
        left_rows = df_response_judging[
            (df_response_judging["llm_judge"] == selected_judge)
            & (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]
        right_rows = df_response_judging[
            (df_response_judging["llm_judge"] == selected_judge)
            & (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]

        # Bug fix: the original took .iloc[0] of both selections before any
        # check, so a missing combination crashed with IndexError and its
        # later `if not ….empty:` guards could never be False (a Series
        # returned by .iloc[0] is never empty).
        if not left_rows.empty and not right_rows.empty:
            judging_details_left = left_rows.iloc[0]
            judging_details_right = right_rows.iloc[0]

            # Flag whether the judge flipped its preference when the
            # response order was swapped, as a consistent judge must.
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

            with col1:
                st.write(f"**{fixed_model}** vs **{selected_model}**")
                st.write(
                    f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                )
                st.code(judging_details_left["judging_response_string"])

            with col2:
                st.write(f"**{selected_model}** vs **{fixed_model}**")
                st.write(
                    f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                )
                st.code(judging_details_right["judging_response_string"])
        else:
            st.write("No judging details found for the selected combination.")

    st.divider()

    # --- Pairwise-choice distributions over all judges --------------------
    col1, col2 = st.columns(2)

    with col1:
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
            & (df_response_judging["second_completion_by"] == selected_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_left)

    with col2:
        pairwise_counts_right = df_response_judging[
            (df_response_judging["first_completion_by"] == selected_model)
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_right)
|
235 |
+
|
236 |
+
# Tab 2: static "About Us" content.
with tabs[2]:
    st.write("This is the about us page.")
    # Placeholder copy; st.write renders the string as Markdown.
    st.write(
        """
        **Our Mission:**
        To provide the best service and data insights.

        **Our Team:**
        - Alice
        - Bob
        - Charlie
        """
    )
|