remove legacy cost table tab
- app.py +3 -148
- src/populate.py +2 -5
app.py
CHANGED
@@ -1,36 +1,15 @@
 import gradio as gr
 import pandas as pd
 
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
 from src.display.css_html_js import custom_css
-from src.display.utils import (
-    COLS,
-    COST_COLS,
-    COST_TYPES,
-    TS_COLS,
-    TS_TYPES,
-    TYPES,
-    AutoEvalColumn,
-    CostEvalColumn,
-    TSEvalColumn,
-    fields,
-)
-
-# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.display.utils import COLS, TS_COLS, TS_TYPES, TYPES, AutoEvalColumn, TSEvalColumn, fields
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS, TS_COLS)
+original_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, TS_COLS)
 
-# raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
-leaderboard_cost_df = cost_df.copy()
 leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
@@ -60,20 +39,6 @@ def update_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
-def update_cost_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-    use_case_flavor_query: list,
-):
-    filtered_df = filter_llm_func(hidden_df, llm_query)
-    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
-    df = select_columns_cost_table(filtered_df, columns)
-    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
-
-
 def update_ts_table(
     hidden_df: pd.DataFrame,
     columns: list,
@@ -125,23 +90,6 @@ def init_leaderboard_df(
     )
 
 
-def init_leaderboard_cost_df(
-    leaderboard_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-    use_case_type_query: list,
-):
-
-    return update_cost_table(
-        leaderboard_df,
-        columns,
-        llm_query,
-        llm_provider_query,
-        use_case_type_query,
-    )
-
-
 def init_leaderboard_ts_df(
     leaderboard_df: pd.DataFrame,
     columns: list,
@@ -183,10 +131,6 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd.DataFrame:
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
-def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
-    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
-
-
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -204,14 +148,6 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
-def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        CostEvalColumn.model.name,
-    ]
-    filtered_df = df[always_here_cols + [c for c in COST_COLS if c in df.columns and c in columns]]
-    return filtered_df
-
-
 def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
         TSEvalColumn.model.name,
@@ -423,87 +359,6 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-        with gr.TabItem("🏅 Latency & Cost", elem_id="llm-benchmark-tab-table", id=1):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(CostEvalColumn) if not c.hidden and not c.never_hidden],
-                            value=[
-                                c.name
-                                for c in fields(CostEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    with gr.Row():
-                        with gr.Column():
-                            filter_llm = gr.CheckboxGroup(
-                                choices=list(cost_df["Model Name"].unique()),
-                                value=list(cost_df["Model Name"].unique()),
-                                label="Model Name",
-                                info="",
-                                interactive=True,
-                            )
-                        with gr.Column():
-                            filter_llm_provider = gr.CheckboxGroup(
-                                choices=list(cost_df["LLM Provider"].unique()),
-                                value=list(cost_df["LLM Provider"].unique()),
-                                label="LLM Provider",
-                                info="",
-                                interactive=True,
-                            )
-                        with gr.Column():
-                            filter_use_case_type = gr.CheckboxGroup(
-                                choices=["Long", "Short"],
-                                value=["Long", "Short"],
-                                label="Use Case Flavor",
-                                info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
-                                interactive=True,
-                            )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=init_leaderboard_cost_df(
-                    leaderboard_cost_df,
-                    shown_columns.value,
-                    filter_llm.value,
-                    filter_llm_provider.value,
-                    filter_use_case_type.value,
-                ),
-                headers=[c.name for c in fields(CostEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=COST_TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=cost_df[COST_COLS],
-                headers=COST_COLS,
-                datatype=COST_TYPES,
-                visible=False,
-            )
-
-            for selector in [
-                shown_columns,
-                filter_llm,
-                filter_llm_provider,
-                filter_use_case_type,
-            ]:
-                selector.change(
-                    update_cost_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_llm,
-                        filter_llm_provider,
-                        filter_use_case_type,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
         with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
src/populate.py
CHANGED
@@ -6,7 +6,7 @@ from src.display.utils import AutoEvalColumn
 
 
 def get_leaderboard_df_crm(
-    crm_results_path: str, accuracy_cols: list, cost_cols: list, ts_cols: list
+    crm_results_path: str, accuracy_cols: list, ts_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
@@ -30,9 +30,6 @@ def get_leaderboard_df_crm(
         on=["Model Name", "Cost and Speed: Flavor"],
     )
 
-    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
-
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
     leaderboard_ts__crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
@@ -64,4 +61,4 @@ def get_leaderboard_df_crm(
         by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
     )
     leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
+    return leaderboard_accuracy_df, leaderboard_ts_df
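
For reference, a minimal sketch of the call site after this change, mirroring the new app.py lines in the diff above; all names and paths are taken from the diff itself, nothing here is new API:

# Post-change call site, as in the updated app.py.
from src.display.utils import COLS, TS_COLS
from src.envs import CRM_RESULTS_PATH
from src.populate import get_leaderboard_df_crm

# The helper now returns only the accuracy and Trust & Safety dataframes;
# the legacy cost dataframe (and the COST_COLS argument) is gone.
original_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, TS_COLS)

leaderboard_df = original_df.copy()
leaderboard_ts_df = ts_df.copy()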