Paul Hager committed
Commit 37b23b1 · 1 Parent(s): 44e7954

claude test
Files changed (3)
  1. app.py +12 -15
  2. src/display/utils.py +19 -11
  3. src/leaderboard/read_evals.py +23 -21
app.py CHANGED

@@ -23,7 +23,15 @@ from src.display.utils import (
     WeightType,
     Precision,
 )
-from src.envs import API, EVAL_RESULTS_PATH_CDM, EVAL_RESULTS_PATH_CDM_FI, REPO_ID, RESULTS_REPO_CDM, RESULTS_REPO_CDM_FI, TOKEN
+from src.envs import (
+    API,
+    EVAL_RESULTS_PATH_CDM,
+    EVAL_RESULTS_PATH_CDM_FI,
+    REPO_ID,
+    RESULTS_REPO_CDM,
+    RESULTS_REPO_CDM_FI,
+    TOKEN,
+)
 from src.populate import get_leaderboard_df


@@ -62,6 +70,7 @@ except Exception:
 LEADERBOARD_DF_CDM = get_leaderboard_df(EVAL_RESULTS_PATH_CDM, COLS, BENCHMARK_COLS)
 LEADERBOARD_DF_CDM_FI = get_leaderboard_df(EVAL_RESULTS_PATH_CDM_FI, COLS, BENCHMARK_COLS)

+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -74,18 +83,6 @@ def init_leaderboard(dataframe):
             label="Select Columns to Display:",
         ),
         search_columns=[AutoEvalColumn.model.name],
-        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        # filter_columns=[
-        #     ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-        #     ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-        #     ColumnFilter(
-        #         AutoEvalColumn.seq_length.name,
-        #         type="checkboxgroup",
-        #         label="Sequence Lengths",
-        #     )
-        #     ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
-        # ],
-        # bool_checkboxgroup_label="Hide models",
         interactive=False,
     )

@@ -97,10 +94,10 @@ with demo:

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("MIMIC CDM", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF_CDM)
+            leaderboard_cdm = init_leaderboard(LEADERBOARD_DF_CDM)

         with gr.TabItem("MIMIC CDM FI", elem_id="llm-benchmark-tab-table", id=1):
-            leaderboard = init_leaderboard(LEADERBOARD_DF_CDM_FI)
+            leaderboard_cdm_fi = init_leaderboard(LEADERBOARD_DF_CDM_FI)

         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
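In app.py each tab now keeps its own handle (leaderboard_cdm, leaderboard_cdm_fi), both built by init_leaderboard, whose visible contract in this diff is that it rejects missing or empty input. Below is a minimal sketch of just that guard using plain pandas; check_leaderboard_df is a hypothetical stand-in for init_leaderboard, and the Gradio/leaderboard construction itself is omitted.

```python
import pandas as pd


def check_leaderboard_df(dataframe: pd.DataFrame) -> pd.DataFrame:
    # Mirrors the guard at the top of init_leaderboard() in app.py: refuse to
    # build a leaderboard tab from a missing or empty DataFrame.
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return dataframe


# Hypothetical usage: a populated frame passes, an empty one raises.
check_leaderboard_df(pd.DataFrame({"Model": ["demo-model"], "Average ⬆️": [0.42]}))
try:
    check_leaderboard_df(pd.DataFrame())
except ValueError as err:
    print(f"rejected: {err}")
```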
src/display/utils.py CHANGED

@@ -5,6 +5,7 @@ import pandas as pd

 from src.about import Tasks

+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

@@ -20,15 +21,16 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False

+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.value.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -37,7 +39,9 @@ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Arch
 # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 auto_eval_column_dict.append(["seq_length", ColumnContent, ColumnContent("Max Sequence Length", "number", False)])
-auto_eval_column_dict.append(["model_quantization_bits", ColumnContent, ColumnContent("Quantization Bits", "number", False)])
+auto_eval_column_dict.append(
+    ["model_quantization_bits", ColumnContent, ColumnContent("Quantization Bits", "number", False)]
+)
 # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
@@ -45,6 +49,7 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -55,12 +60,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)

+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji


 class ModelType(Enum):
@@ -85,18 +91,20 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown

+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")

+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

     def from_str(precision):
@@ -106,14 +114,15 @@ class Precision(Enum):
             return Precision.bfloat16
         if precision in ["float32"]:
             return Precision.float32
-        #if precision in ["8bit"]:
+        # if precision in ["8bit"]:
         #     return Precision.qt_8bit
-        #if precision in ["4bit"]:
+        # if precision in ["4bit"]:
         #     return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
+        # if precision in ["GPTQ", "None"]:
         #     return Precision.qt_GPTQ
         return Precision.Unknown

+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

@@ -121,4 +130,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/leaderboard/read_evals.py CHANGED

@@ -13,28 +13,35 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, Weigh
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer

-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                tk = AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception as e:
                 return (
                     False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config

     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )

     except Exception as e:
@@ -116,7 +123,6 @@ class EvalResult:
         model_quantization_bits = config.get("model_quantization_bits", 0)
         # print(self.seq_length)

-
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -128,7 +134,7 @@
             still_on_hub=still_on_hub,
             architecture=architecture,
             seq_length=seq_length,
-            model_quantization_bits=model_quantization_bits
+            model_quantization_bits=model_quantization_bits,
         )

     def update_with_request_file(self, requests_path):
@@ -151,28 +157,24 @@

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        # print(self.seq_length)
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
-            "eval_name": self.eval_name, # not a column, just a save name,
-            # AutoEvalColumn.precision.name: self.precision.value.name,
-            # AutoEvalColumn.model_type.name: self.model_type.value.name,
-            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            "eval_name": self.eval_name,  # not a column, just a save name
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            # AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
-            # AutoEvalColumn.license.name: self.license,
-            # AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.average.name: round(average, 2),  # Round to 2 decimal places
             AutoEvalColumn.params.name: self.params,
             AutoEvalColumn.seq_length.name: self.seq_length,
             AutoEvalColumn.model_quantization_bits.name: self.model_quantization_bits,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }

+        # Add task results
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            if task.value.benchmark in self.results:
+                data_dict[task.value.col_name] = round(self.results[task.value.benchmark], 2)
+            else:
+                data_dict[task.value.col_name] = None

         return data_dict
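The rewritten tail of EvalResult.to_dict() now tolerates a results file that is missing a benchmark: the corresponding score column becomes None instead of raising KeyError, and both the per-task scores and the average are rounded to two decimals. A small self-contained sketch of that loop follows; Task and Tasks are hypothetical stand-ins for src.about.Tasks, and a plain dict stands in for self.results.

```python
from dataclasses import dataclass
from enum import Enum


@dataclass(frozen=True)
class Task:
    # Hypothetical stand-in for the Task entries in src.about.Tasks.
    benchmark: str
    col_name: str


class Tasks(Enum):
    task0 = Task("mimic_cdm", "MIMIC CDM")
    task1 = Task("mimic_cdm_fi", "MIMIC CDM FI")


def task_columns(results):
    # Mirrors the new loop at the end of EvalResult.to_dict(): a benchmark missing
    # from the parsed results becomes None instead of raising KeyError, and present
    # scores are rounded to two decimal places.
    data_dict = {}
    for task in Tasks:
        if task.value.benchmark in results:
            data_dict[task.value.col_name] = round(results[task.value.benchmark], 2)
        else:
            data_dict[task.value.col_name] = None
    return data_dict


print(task_columns({"mimic_cdm": 0.4567}))
# {'MIMIC CDM': 0.46, 'MIMIC CDM FI': None}
```

Two details the commit leaves as-is: is_model_on_hub is still annotated as returning tuple[bool, str] even though every branch returns a three-element tuple whose last item is the loaded config or None, and the average remains divided by len(Tasks), so a benchmark that is now reported as None still pulls the average down.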