BearSean committed on
Commit
9c13f5f
·
verified ·
1 Parent(s): af4234d
src/display/about.py CHANGED
@@ -2,7 +2,7 @@ from src.display.utils import ModelType
2
 
3
 
4
  TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
- BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240604.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""
6
 
7
  INTRODUCTION_TEXT = f"""
8
  🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM).
@@ -33,7 +33,7 @@ Please provide information about the model through an issue! 🤩
33
 
34
  📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
35
 
36
- We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by HuggingFace OpenLLM. We have also added a new dataset prepared from scratch.
37
  - Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
38
  - Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
39
  - Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
@@ -41,8 +41,14 @@ We have set up a benchmark using datasets translated into Korean, and applied va
41
  - Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
42
  - Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
43
  - Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
44
-
45
- To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, and common sense. The final score is converted to the average score from each evaluation datasets.
 
 
 
 
 
 
46
 
47
  GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
48
 
 
2
 
3
 
4
  TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
5
+ BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""
6
 
7
  INTRODUCTION_TEXT = f"""
8
  🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM).
 
33
 
34
  📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
35
 
36
+ We have set up a benchmark using datasets translated into Korean, and applied variations by human experts, from the six tasks (HellaSwag, MMLU, Arc, Truthful QA, Winogrande, GSM8k) operated by __HuggingFace [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)__. We have also added a new dataset prepared from scratch.
37
  - Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
38
  - Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
39
  - Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
 
41
  - Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
42
  - Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
43
  - Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
44
+ - Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
45
+ - Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
46
+ - KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
47
+ - KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
48
+ - Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
49
+ - Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
50
+
51
+ To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, truthfulness and common sense. The final score is converted to the average score from each evaluation datasets.
52
 
53
  GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.
54
 
src/display/utils.py CHANGED
@@ -21,6 +21,13 @@ class Tasks(Enum):
21
  winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
22
  gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
23
  commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
 
 
 
 
 
 
 
24
 
25
  # These classes are for user facing column names,
26
  # to avoid having to change them all around the code
 
21
  winogrande = Task("ko_winogrande", "acc_norm", "Ko-Winogrande")
22
  gsm8k = Task("ko_gsm8k", "acc_norm", "Ko-GSM8k")
23
  commongen_v2 = Task("ko_commongen_v2", "acc_norm", "Ko-CommonGen V2")
24
+ eqBench = Task("ko_eq_bench", "acc_norm", "Ko-EQ Bench")
25
+ instFollow = Task("ko_inst_follow", "acc_norm", "Ko-InstFollow")
26
+ korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
27
+ korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
28
+ harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
29
+ helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")
30
+
31
 
32
  # These classes are for user facing column names,
33
  # to avoid having to change them all around the code
src/leaderboard/read_evals.py CHANGED
@@ -103,10 +103,12 @@ class EvalResult:
103
  results[task.benchmark] = 0.0
104
  continue
105
 
106
- # Two new tasks have been added, we need to skip them for now
107
- if task.benchmark == "ko_winogrande" or task.benchmark == "ko_gsm8k":
108
- results[task.benchmark] = 0.0
109
- continue
 
 
110
 
111
  # We average all scores of a given metric (mostly for mmlu)
112
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
@@ -148,16 +150,29 @@ class EvalResult:
148
 
149
  def to_dict(self):
150
  """Converts the Eval Result to a dict compatible with our dataframe display"""
151
-
152
- # Skip the two new tasks for now
153
- # TODO: safely remove this code when the task results are added
154
  skip_avg_len = 0
155
  if self.results['ko_winogrande'] == 0.0:
156
  skip_avg_len += 1
157
  if self.results['ko_gsm8k'] == 0.0:
158
  skip_avg_len += 1
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
 
161
  data_dict = {
162
  "eval_name": self.eval_name, # not a column, just a save name,
163
  AutoEvalColumn.precision.name: self.precision.value.name,
 
103
  results[task.benchmark] = 0.0
104
  continue
105
 
106
+ # New tasks have been added, we need to skip them if not exists
107
+ if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
108
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
109
+ if accs.size == 0 or any([acc is None for acc in accs]):
110
+ results[task.benchmark] = 0.0
111
+ continue
112
 
113
  # We average all scores of a given metric (mostly for mmlu)
114
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
 
150
 
151
  def to_dict(self):
152
  """Converts the Eval Result to a dict compatible with our dataframe display"""
153
+
154
+ # Skip the new tasks for now
155
+ # TODO: safely remove this code when the task results are all added
156
  skip_avg_len = 0
157
  if self.results['ko_winogrande'] == 0.0:
158
  skip_avg_len += 1
159
  if self.results['ko_gsm8k'] == 0.0:
160
  skip_avg_len += 1
161
+ if self.results['ko_eq_bench'] == 0.0:
162
+ skip_avg_len += 1
163
+ if self.results['ko_inst_follow'] == 0.0:
164
+ skip_avg_len += 1
165
+ if self.results['kor_nat_cka'] == 0.0:
166
+ skip_avg_len += 1
167
+ if self.results['kor_nat_sva'] == 0.0:
168
+ skip_avg_len += 1
169
+ if self.results['ko_harmlessness'] == 0.0:
170
+ skip_avg_len += 1
171
+ if self.results['ko_helpfulness'] == 0.0:
172
+ skip_avg_len += 1
173
 
174
  average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
175
+
176
  data_dict = {
177
  "eval_name": self.eval_name, # not a column, just a save name,
178
  AutoEvalColumn.precision.name: self.precision.value.name,
src/tools/plots.py CHANGED
@@ -41,6 +41,19 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
41
  avg_skip_len += 1
42
  if row["results"]["ko_gsm8k"] == 0.0:
43
  avg_skip_len += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
45
  else:
46
  current_score = row["results"][task.benchmark]
 
41
  avg_skip_len += 1
42
  if row["results"]["ko_gsm8k"] == 0.0:
43
  avg_skip_len += 1
44
+ if row["results"]["ko_eq_bench"] == 0.0:
45
+ avg_skip_len += 1
46
+ if row["results"]["ko_inst_follow"] == 0.0:
47
+ avg_skip_len += 1
48
+ if row["results"]["kor_nat_cka"] == 0.0:
49
+ avg_skip_len += 1
50
+ if row["results"]["kor_nat_sva"] == 0.0:
51
+ avg_skip_len += 1
52
+ if row["results"]["ko_harmlessness"] == 0.0:
53
+ avg_skip_len += 1
54
+ if row["results"]["ko_helpfulness"] == 0.0:
55
+ avg_skip_len += 1
56
+
57
  current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
58
  else:
59
  current_score = row["results"][task.benchmark]