yibum committed on
Commit 098bb60 · 1 Parent(s): 84ee137

remove legacy cost table tab

Files changed (2)
  1. app.py +3 -148
  2. src/populate.py +2 -5
app.py CHANGED
@@ -1,36 +1,15 @@
 import gradio as gr
 import pandas as pd
 
-from src.about import (  # CITATION_BUTTON_LABEL,; CITATION_BUTTON_TEXT,; EVALUATION_QUEUE_TEXT,
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
+from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
 from src.display.css_html_js import custom_css
-from src.display.utils import (  # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EVAL_COLS,; NUMERIC_INTERVALS,; ModelType,; Precision,
-    COLS,
-    COST_COLS,
-    COST_TYPES,
-    TS_COLS,
-    TS_TYPES,
-    TYPES,
-    AutoEvalColumn,
-    CostEvalColumn,
-    TSEvalColumn,
-    fields,
-)
-
-# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.display.utils import COLS, TS_COLS, TS_TYPES, TYPES, AutoEvalColumn, TSEvalColumn, fields
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df, cost_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
+original_df, ts_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, TS_COLS)
 
-# raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
-leaderboard_cost_df = cost_df.copy()
 leaderboard_ts_df = ts_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
@@ -60,20 +39,6 @@ def update_table(
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
-def update_cost_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-    use_case_flavor_query: list,
-):
-    filtered_df = filter_llm_func(hidden_df, llm_query)
-    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
-    df = select_columns_cost_table(filtered_df, columns)
-    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
-
-
 def update_ts_table(
     hidden_df: pd.DataFrame,
     columns: list,
@@ -125,23 +90,6 @@ def init_leaderboard_df(
     )
 
 
-def init_leaderboard_cost_df(
-    leaderboard_df: pd.DataFrame,
-    columns: list,
-    llm_query: list,
-    llm_provider_query: list,
-    use_case_type_query: list,
-):
-
-    return update_cost_table(
-        leaderboard_df,
-        columns,
-        llm_query,
-        llm_provider_query,
-        use_case_type_query,
-    )
-
-
 def init_leaderboard_ts_df(
     leaderboard_df: pd.DataFrame,
     columns: list,
@@ -183,10 +131,6 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd.DataFrame:
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
-def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
-    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
-
-
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -204,14 +148,6 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
-def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        CostEvalColumn.model.name,
-    ]
-    filtered_df = df[always_here_cols + [c for c in COST_COLS if c in df.columns and c in columns]]
-    return filtered_df
-
-
 def select_columns_ts_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
         TSEvalColumn.model.name,
@@ -423,87 +359,6 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-        with gr.TabItem("🏅 Latency & Cost", elem_id="llm-benchmark-tab-table", id=1):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(CostEvalColumn) if not c.hidden and not c.never_hidden],
-                            value=[
-                                c.name
-                                for c in fields(CostEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-            with gr.Row():
-                with gr.Column():
-                    filter_llm = gr.CheckboxGroup(
-                        choices=list(cost_df["Model Name"].unique()),
-                        value=list(cost_df["Model Name"].unique()),
-                        label="Model Name",
-                        info="",
-                        interactive=True,
-                    )
-                with gr.Column():
-                    filter_llm_provider = gr.CheckboxGroup(
-                        choices=list(cost_df["LLM Provider"].unique()),
-                        value=list(cost_df["LLM Provider"].unique()),
-                        label="LLM Provider",
-                        info="",
-                        interactive=True,
-                    )
-                with gr.Column():
-                    filter_use_case_type = gr.CheckboxGroup(
-                        choices=["Long", "Short"],
-                        value=["Long", "Short"],
-                        label="Use Case Flavor",
-                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
-                        interactive=True,
-                    )
-
-            leaderboard_table = gr.components.Dataframe(
-                value=init_leaderboard_cost_df(
-                    leaderboard_cost_df,
-                    shown_columns.value,
-                    filter_llm.value,
-                    filter_llm_provider.value,
-                    filter_use_case_type.value,
-                ),
-                headers=[c.name for c in fields(CostEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=COST_TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=cost_df[COST_COLS],
-                headers=COST_COLS,
-                datatype=COST_TYPES,
-                visible=False,
-            )
-
-            for selector in [
-                shown_columns,
-                filter_llm,
-                filter_llm_provider,
-                filter_use_case_type,
-            ]:
-                selector.change(
-                    update_cost_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        filter_llm,
-                        filter_llm_provider,
-                        filter_use_case_type,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
         with gr.TabItem("🏅 Trust & Safety", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
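
For context, the wiring the removed tab relied on is the same CheckboxGroup → change() → Dataframe pattern that the surviving tabs (accuracy, Trust & Safety) keep using. A minimal, self-contained sketch of that pattern follows; the model names and columns are made up for illustration and are not the leaderboard's real schema.

# Minimal sketch of the CheckboxGroup -> Dataframe wiring used by the leaderboard tabs.
# Column and model names here are placeholders, not the real leaderboard schema.
import gradio as gr
import pandas as pd

source_df = pd.DataFrame(
    {
        "Model Name": ["model-a", "model-b", "model-c"],
        "Accuracy": [0.91, 0.84, 0.78],
    }
)


def update_table(hidden_df: pd.DataFrame, models: list) -> pd.DataFrame:
    # Same shape as update_ts_table in app.py: filter rows, then return the frame to render.
    return hidden_df[hidden_df["Model Name"].isin(models)]


with gr.Blocks() as demo:
    model_filter = gr.CheckboxGroup(
        choices=list(source_df["Model Name"].unique()),
        value=list(source_df["Model Name"].unique()),
        label="Model Name",
        interactive=True,
    )
    # Visible table that gets re-rendered on every filter change.
    table = gr.components.Dataframe(value=source_df, interactive=False)
    # Hidden copy keeps the unfiltered data available to the event handler.
    hidden_table = gr.components.Dataframe(value=source_df, visible=False)
    model_filter.change(update_table, [hidden_table, model_filter], table, queue=True)

if __name__ == "__main__":
    demo.launch()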
src/populate.py CHANGED
@@ -6,7 +6,7 @@ from src.display.utils import AutoEvalColumn
 
 
 def get_leaderboard_df_crm(
-    crm_results_path: str, accuracy_cols: list, cost_cols: list
+    crm_results_path: str, accuracy_cols: list, ts_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
@@ -30,9 +30,6 @@
         on=["Model Name", "Cost and Speed: Flavor"],
     )
 
-    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
-
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
     leaderboard_ts__crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
@@ -64,4 +61,4 @@ def get_leaderboard_df_crm(
         by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
     )
     leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
-    return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
+    return leaderboard_accuracy_df, leaderboard_ts_df
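
The cost branch deleted above attached provider metadata with pandas' join-on-a-column idiom (set_index on the reference frame, then join on the key column), followed by column selection and rounding. A toy sketch of that idiom, with placeholder frames rather than the real leaderboard data:

# Toy version of the DataFrame.join-on-a-column idiom used by the removed cost branch.
# ref_df is indexed by "Model Name" so its columns can be attached to each matching row.
import pandas as pd

ref_df = pd.DataFrame({"Model Name": ["model-a", "model-b"], "LLM Provider": ["prov-x", "prov-y"]})
cost_df = pd.DataFrame({"Model Name": ["model-a", "model-b"], "Cost": [1.234, 5.678]})

joined = cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
print(joined.round(decimals=2))  # non-numeric columns are left untouched by round()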