sh1gechan committed
Commit a85bc80 · 1 Parent(s): 39a5dc9

Add an explanation about AVG

Files changed (1)
  1. src/about.py +31 -9
src/about.py CHANGED
@@ -45,11 +45,11 @@ class Tasks(Enum):
     SUM = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True) # Summarization - 要約
     alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
     alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
-    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22", TaskType.MT)
+    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22", TaskType.MT)
     alt_j_to_e_bert_score_en_f1 = Task("scores", "alt-j-to-e_bert_score_en_f1", "ALT J to E BERT Score", TaskType.MT)
     alt_j_to_e_bleu_en = Task("scores", "alt-j-to-e_bleu_en", "ALT J to E BLEU", TaskType.MT)
-    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22", TaskType.MT)
-    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA", TaskType.EL)
+    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22", TaskType.MT)
+    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA", TaskType.EL)
     commonsensemoralja_exact_match = Task(
         "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA", TaskType.MC
     )
@@ -61,7 +61,7 @@ class Tasks(Enum):
     jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI", TaskType.NLI)
     jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM", TaskType.NLI)
     jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK", TaskType.NLI)
-    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad", TaskType.RC)
+    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad", TaskType.RC)
     jsts_pearson = Task(
         "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
@@ -69,8 +69,8 @@ class Tasks(Enum):
         "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
     kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI", TaskType.MC)
-    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS", TaskType.MR)
-    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec)", TaskType.CG)
+    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS", TaskType.MR)
+    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec)", TaskType.CG)
     mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint)", TaskType.CG)
     mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU", TaskType.HE)
     niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC", TaskType.QA)
@@ -85,19 +85,19 @@ class Tasks(Enum):
     )
     wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU", TaskType.MT)
     wikicorpus_e_to_j_comet_wmt22 = Task(
-        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22", TaskType.MT
     )
     wikicorpus_j_to_e_bert_score_en_f1 = Task(
         "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score", TaskType.MT
     )
     wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU", TaskType.MT)
     wikicorpus_j_to_e_comet_wmt22 = Task(
-        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22", TaskType.MT
     )
     xlsum_ja_bert_score_ja_f1 = Task("scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score", TaskType.SUM)
     xlsum_ja_bleu_ja = Task("scores", "xlsum_ja_bleu_ja", "XL-Sum JA BLEU", TaskType.SUM)
     xlsum_ja_rouge1 = Task("scores", "xlsum_ja_rouge1", "XL-Sum ROUGE1", TaskType.SUM)
-    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2", TaskType.SUM)
+    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2", TaskType.SUM)
     # xlsum_ja_rouge2_scaling = Task("scores", "xlsum_ja_rouge2_scaling", "XL-Sum JA ROUGE2 Scaling")
     xlsum_ja_rougeLsum = Task("scores", "xlsum_ja_rougeLsum", "XL-Sum ROUGE-Lsum", TaskType.SUM)
 
@@ -220,6 +220,17 @@ This task is supported by llm-jp-eval, but it is not included in the evaluation
 ## Reproducibility
 To reproduce our results, please follow the instructions of the evalution tool, **llm-jp-eval** available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
 
+## Average Score Calculation
+For the following task categories (RC, EL, MR, MT, CG, SUM), the tasks marked with ⭐ are included in the average calculation:
+
+Tasks included in average calculation:
+- RC: JSQuAD ⭐
+- EL: ChABSA ⭐
+- MR: MAWPS ⭐
+- MT: ALT E to J COMET WMT22 ⭐, ALT J to E COMET WMT22 ⭐, WikiCorpus E to J COMET WMT22 ⭐, WikiCorpus J to E COMET WMT22 ⭐
+- CG: MBPP (exec) ⭐
+- SUM: XL-Sum ROUGE2 ⭐
+
 """
 
 LLM_BENCHMARKS_TEXT_JA = """
@@ -302,6 +313,17 @@ LLM_BENCHMARKS_TEXT_JA = """
 
 ## 再現性
 我々の結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
+
+## 平均スコアの計算について
+以下のタスクカテゴリー(RC、EL、MR、MT、CG、SUM)において、⭐マークの付いたタスクのみが平均値の計算に含まれます:
+
+平均値計算に含まれるタスク:
+- RC:JSQuAD ⭐
+- EL:ChABSA ⭐
+- MR:MAWPS ⭐
+- MT:ALT E to J COMET WMT22 ⭐、ALT J to E COMET WMT22 ⭐、WikiCorpus E to J COMET WMT22 ⭐、WikiCorpus J to E COMET WMT22 ⭐
+- CG:MBPP (exec) ⭐
+- SUM:XL-Sum ROUGE2 ⭐
 """
 
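To make the intent of the new section concrete, here is a minimal, hypothetical sketch of the calculation it describes: within each listed category, only the ⭐-marked metrics contribute to that category's average. The `INCLUDED_IN_AVG` mapping, the `category_averages` helper, and the example scores below are illustrative assumptions, not the leaderboard's actual implementation; only the metric identifiers mirror the benchmark names defined in src/about.py above.

```python
# Hypothetical sketch of the per-category averaging described in the new
# "Average Score Calculation" section; NOT the leaderboard's actual code.
from statistics import mean

# Category -> metrics included in that category's average (the ⭐-marked tasks).
INCLUDED_IN_AVG = {
    "RC": ["jsquad_char_f1"],
    "EL": ["chabsa_set_f1"],
    "MR": ["mawps_exact_match"],
    "MT": [
        "alt-e-to-j_comet_wmt22",
        "alt-j-to-e_comet_wmt22",
        "wikicorpus-e-to-j_comet_wmt22",
        "wikicorpus-j-to-e_comet_wmt22",
    ],
    "CG": ["mbpp_code_exec"],
    "SUM": ["xlsum_ja_rouge2"],
}


def category_averages(scores: dict[str, float]) -> dict[str, float]:
    """Average only the ⭐-marked metrics within each category."""
    return {
        category: mean(scores[metric] for metric in metrics)
        for category, metrics in INCLUDED_IN_AVG.items()
    }


if __name__ == "__main__":
    # Made-up scores, keyed by the benchmark identifiers from the Tasks enum.
    example_scores = {
        "jsquad_char_f1": 0.88,
        "chabsa_set_f1": 0.55,
        "mawps_exact_match": 0.62,
        "alt-e-to-j_comet_wmt22": 0.90,
        "alt-j-to-e_comet_wmt22": 0.87,
        "wikicorpus-e-to-j_comet_wmt22": 0.83,
        "wikicorpus-j-to-e_comet_wmt22": 0.80,
        "mbpp_code_exec": 0.41,
        "xlsum_ja_rouge2": 0.19,
    }
    # MT averages its four COMET scores: (0.90 + 0.87 + 0.83 + 0.80) / 4 = 0.85
    print(category_averages(example_scores))
```

In this sketch a single-metric category simply reports that metric unchanged (e.g. the RC average equals the JSQuAD score), while MT averages its four COMET WMT22 scores.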