sh1gechan committed
Commit a85bc80 · 1 Parent(s): 39a5dc9

Add an explanation about AVG

Files changed (1)
  1. src/about.py +31 -9
src/about.py CHANGED
@@ -45,11 +45,11 @@ class Tasks(Enum):
     SUM = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True) # Summarization - 要約
     alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
     alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
-    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22", TaskType.MT)
+    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22", TaskType.MT)
     alt_j_to_e_bert_score_en_f1 = Task("scores", "alt-j-to-e_bert_score_en_f1", "ALT J to E BERT Score", TaskType.MT)
     alt_j_to_e_bleu_en = Task("scores", "alt-j-to-e_bleu_en", "ALT J to E BLEU", TaskType.MT)
-    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22", TaskType.MT)
-    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA", TaskType.EL)
+    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22", TaskType.MT)
+    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA", TaskType.EL)
     commonsensemoralja_exact_match = Task(
         "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA", TaskType.MC
     )
@@ -61,7 +61,7 @@ class Tasks(Enum):
     jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI", TaskType.NLI)
     jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM", TaskType.NLI)
     jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK", TaskType.NLI)
-    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad", TaskType.RC)
+    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad", TaskType.RC)
     jsts_pearson = Task(
         "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
@@ -69,8 +69,8 @@ class Tasks(Enum):
         "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
     kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI", TaskType.MC)
-    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS", TaskType.MR)
-    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec)", TaskType.CG)
+    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS", TaskType.MR)
+    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec)", TaskType.CG)
     mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint)", TaskType.CG)
     mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU", TaskType.HE)
     niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC", TaskType.QA)
@@ -85,19 +85,19 @@ class Tasks(Enum):
     )
     wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU", TaskType.MT)
     wikicorpus_e_to_j_comet_wmt22 = Task(
-        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22", TaskType.MT
     )
     wikicorpus_j_to_e_bert_score_en_f1 = Task(
         "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score", TaskType.MT
     )
     wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU", TaskType.MT)
     wikicorpus_j_to_e_comet_wmt22 = Task(
-        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22", TaskType.MT
     )
     xlsum_ja_bert_score_ja_f1 = Task("scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score", TaskType.SUM)
     xlsum_ja_bleu_ja = Task("scores", "xlsum_ja_bleu_ja", "XL-Sum JA BLEU", TaskType.SUM)
     xlsum_ja_rouge1 = Task("scores", "xlsum_ja_rouge1", "XL-Sum ROUGE1", TaskType.SUM)
-    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2", TaskType.SUM)
+    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2", TaskType.SUM)
     # xlsum_ja_rouge2_scaling = Task("scores", "xlsum_ja_rouge2_scaling", "XL-Sum JA ROUGE2 Scaling")
     xlsum_ja_rougeLsum = Task("scores", "xlsum_ja_rougeLsum", "XL-Sum ROUGE-Lsum", TaskType.SUM)
 
@@ -220,6 +220,17 @@ This task is supported by llm-jp-eval, but it is not included in the evaluation
 ## Reproducibility
 To reproduce our results, please follow the instructions of the evalution tool, **llm-jp-eval** available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
 
+## Average Score Calculation
+For the following task categories (RC, EL, MR, MT, CG, SUM), the tasks marked with ⭐ are included in the average calculation:
+
+Tasks included in average calculation:
+- RC: JSQuAD ⭐
+- EL: ChABSA ⭐
+- MR: MAWPS ⭐
+- MT: ALT E to J COMET WMT22 ⭐, ALT J to E COMET WMT22 ⭐, WikiCorpus E to J COMET WMT22 ⭐, WikiCorpus J to E COMET WMT22 ⭐
+- CG: MBPP (exec) ⭐
+- SUM: XL-Sum ROUGE2 ⭐
+
 """
 
 LLM_BENCHMARKS_TEXT_JA = """
@@ -302,6 +313,17 @@ LLM_BENCHMARKS_TEXT_JA = """
 
 ## 再現性
 我々の結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
+
+## 平均スコアの計算について
+以下のタスクカテゴリー(RC、EL、MR、MT、CG、SUM)において、⭐マークの付いたタスクのみが平均値の計算に含まれます:
+
+平均値計算に含まれるタスク:
+- RC:JSQuAD ⭐
+- EL:ChABSA ⭐
+- MR:MAWPS ⭐
+- MT:ALT E to J COMET WMT22 ⭐、ALT J to E COMET WMT22 ⭐、WikiCorpus E to J COMET WMT22 ⭐、WikiCorpus J to E COMET WMT22 ⭐
+- CG:MBPP (exec) ⭐
+- SUM:XL-Sum ROUGE2 ⭐
 """
 
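To make the intent of the new section concrete, here is a minimal, hypothetical sketch of the calculation it describes: within each listed category, only the ⭐-marked metrics contribute to that category's average. The `INCLUDED_IN_AVG` mapping, the `category_averages` helper, and the example scores below are illustrative assumptions, not the leaderboard's actual implementation; only the metric identifiers mirror the benchmark names defined in src/about.py above.

```python
# Hypothetical sketch of the per-category averaging described in the new
# "Average Score Calculation" section; NOT the leaderboard's actual code.
from statistics import mean

# Category -> metrics included in that category's average (the ⭐-marked tasks).
INCLUDED_IN_AVG = {
    "RC": ["jsquad_char_f1"],
    "EL": ["chabsa_set_f1"],
    "MR": ["mawps_exact_match"],
    "MT": [
        "alt-e-to-j_comet_wmt22",
        "alt-j-to-e_comet_wmt22",
        "wikicorpus-e-to-j_comet_wmt22",
        "wikicorpus-j-to-e_comet_wmt22",
    ],
    "CG": ["mbpp_code_exec"],
    "SUM": ["xlsum_ja_rouge2"],
}


def category_averages(scores: dict[str, float]) -> dict[str, float]:
    """Average only the ⭐-marked metrics within each category."""
    return {
        category: mean(scores[metric] for metric in metrics)
        for category, metrics in INCLUDED_IN_AVG.items()
    }


if __name__ == "__main__":
    # Made-up scores, keyed by the benchmark identifiers from the Tasks enum.
    example_scores = {
        "jsquad_char_f1": 0.88,
        "chabsa_set_f1": 0.55,
        "mawps_exact_match": 0.62,
        "alt-e-to-j_comet_wmt22": 0.90,
        "alt-j-to-e_comet_wmt22": 0.87,
        "wikicorpus-e-to-j_comet_wmt22": 0.83,
        "wikicorpus-j-to-e_comet_wmt22": 0.80,
        "mbpp_code_exec": 0.41,
        "xlsum_ja_rouge2": 0.19,
    }
    # MT averages its four COMET scores: (0.90 + 0.87 + 0.83 + 0.80) / 4 = 0.85
    print(category_averages(example_scores))
```

In this sketch a single-metric category simply reports that metric unchanged (e.g. the RC average equals the JSQuAD score), while MT averages its four COMET WMT22 scores.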