Yotam-Perlitz committed
Commit • 363d8ae
1 Parent(s): 566ad63
improve bench upload code
Signed-off-by: Yotam-Perlitz <[email protected]>
app.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 import plotly.express as px
 import streamlit as st
 from bat import Benchmark, Config, Reporter, Tester
+from datetime import datetime
 
 
 holistic_scenarios = [
@@ -58,7 +59,7 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
         label="Select Correlation type", options=["kendall", "pearson"], index=0
     )
 
-
+    aggregate_scenario_whitelist = aggragate_scenarios
     # [
    #     scen
    #     for scen in all_scenarios_for_aggragate
@@ -98,7 +99,38 @@ with st.expander("Add your benchmarks here!", icon="🔥"):
     my_benchmark = Benchmark()
     if uploaded_file is not None:
         df = pd.read_csv(uploaded_file)
-
+
+        my_benchmark.assign_df(
+            df,
+            data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
+        )
+
+        allbench = Benchmark()
+        allbench.load_local_catalog()
+
+        allbench.add_aggregate(
+            new_col_name="aggregate",
+            agg_source_name="aggregate",
+            scenario_whitelist=aggregate_scenario_whitelist,
+            min_scenario_for_models_to_appear_in_agg=1
+            if len(aggregate_scenario_whitelist) == 1
+            else 3,
+        )
+
+        uploaded_models = my_benchmark.df[
+            my_benchmark.df["source"].str.contains("uploaded")
+        ]["model"].unique()
+        aggregate_models = allbench.df[allbench.df["source"].str.contains("aggregate")][
+            "model"
+        ].unique()
+
+        # Find the intersection (overlap) of models
+        overlap_models = set(aggregate_models).intersection(uploaded_models)
+        if len(overlap_models) < n_models_taken_list[0]:
+            st.warning(
+                f"You have just {len(overlap_models)} models intersecting with the aggregate!"
+                f"Here are some models you should run your benchmark over:{aggregate_models}"
+            )
 
 
 def run_load(
@@ -108,7 +140,7 @@ def run_load(
     corr_types=["kendall"],
     n_exps=10,
     my_benchmark=Benchmark(),
-    use_caching=
+    use_caching=False,
 ):
     # Create a hash of the inputs to generate a unique cache file for each set of inputs
     input_str = (
@@ -147,24 +179,6 @@ def run_load(
     else:
         print("Cached results not found, calculating")
 
-        cfg = Config(
-            exp_to_run="example",
-            n_models_taken_list=n_models_taken_list,
-            model_select_strategy_list=model_select_strategy_list,
-            corr_types=corr_types,
-            n_exps=n_exps if n_models_taken_list != [0] else 1,
-        )
-
-        # holistic = Benchmark()
-        # holistic.load_local_catalog()
-        # holistic.df = holistic.df.query("scenario in @holistic_scenarios")
-
-        # holistic.clear_repeated_scenarios()
-
-        # aggragate_scores = holistic.df.query('scenario=="aggregate"')[
-        #     ["model", "score"]
-        # ].sort_values(by="score", ascending=False)
-
         allbench = Benchmark()
         allbench.load_local_catalog()
 
@@ -172,22 +186,43 @@ def run_load(
             new_col_name="aggregate",
             agg_source_name="aggregate",
             scenario_whitelist=aggregate_scenario_whitelist,
-            min_scenario_for_models_to_appear_in_agg=1
+            min_scenario_for_models_to_appear_in_agg=1
+            if len(aggregate_scenario_whitelist) == 1
+            else 2,
         )
 
+        allbench.extend(my_benchmark)
+        allbench.clear_repeated_scenarios()
+
         aggragate_scores = allbench.df.query('scenario=="aggregate"')[
             ["model", "score"]
         ].sort_values(by="score", ascending=False)
 
-
+        if not my_benchmark.is_empty:
+            aggragate_scores["in_uploaded"] = aggragate_scores["model"].apply(
+                lambda x: x in my_benchmark.df["model"].unique()
+            )
 
-
-
-
+        # Get unique models for each scenario
+        uploaded_models = allbench.df[
+            allbench.df["source"].str.contains("uploaded")
+        ]["model"].unique()
+        aggregate_models = allbench.df[
+            allbench.df["source"].str.contains("aggregate")
+        ]["model"].unique()
 
-
-
-
+        # Find the intersection (overlap) of models
+        n_overlap_models = len(set(aggregate_models).intersection(uploaded_models))
+        # make sure we are asking for the maximal number of models between the request benchmark and the aggregate
+        n_models_taken_list = [min(n_models_taken_list[0], n_overlap_models)]
+
+        cfg = Config(
+            exp_to_run="example",
+            n_models_taken_list=n_models_taken_list,
+            model_select_strategy_list=model_select_strategy_list,
+            corr_types=corr_types,
+            n_exps=n_exps if n_models_taken_list != [0] else 1,
+        )
 
         tester = Tester(cfg=cfg)
 
@@ -205,7 +240,7 @@ def run_load(
 
 
 agreements, aggragare_score_df = run_load(
-    aggregate_scenario_whitelist=
+    aggregate_scenario_whitelist=aggregate_scenario_whitelist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
     corr_types=[corr_type],
@@ -255,7 +290,7 @@ data = (
 
 # Apply coloring based on 'Z' valuesz
 def highlight_uploaded_benchmark(row):
-    if row["Source"]
+    if "uploaded_benchmark" in row["Source"]:
         return ["background-color: rgba(100,100,100,0.1)"] * len(row)
     else:
         return [""] * len(row)
@@ -314,6 +349,16 @@ with st.expander(label="Model scored by the aggragate"):
 with st.expander(label="Citations"):
     st.code(
         r"""
+
+@misc{berkeley-function-calling-leaderboard,
+      title={Berkeley Function Calling Leaderboard},
+      author={Fanjia Yan and Huanzhi Mao and Charlie Cheng-Jie Ji
+              and Tianjun Zhang and Shishir G. Patil and Ion Stoica and Joseph E.
+              Gonzalez},
+      howpublished={\url{https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html}},
+      year={2024},
+}
+
 @misc{liu2023agentbenchevaluatingllmsagents,
       title={AgentBench: Evaluating LLMs as Agents},
       author={Xiao Liu and Hao Yu and Hanchen Zhang and Yifan Xu and Xuanyu Lei and Hanyu Lai and Yu Gu and Hangliang Ding and Kaiwen Men and Kejuan Yang and Shudan Zhang and Xiang Deng and Aohan Zeng and Zhengxiao Du and Chenhui Zhang and Sheng Shen and Tianjun Zhang and Yu Su and Huan Sun and Minlie Huang and Yuxiao Dong and Jie Tang},