Add explanation about AVG (AVGに関する説明を追加)
src/about.py (+31 -9)
@@ -45,11 +45,11 @@ class Tasks(Enum):
     SUM = Task("scores", "SUM", "AVG (SUM)", TaskType.SUM, True)  # Summarization - 要約
     alt_e_to_j_bert_score_ja_f1 = Task("scores", "alt-e-to-j_bert_score_ja_f1", "ALT E to J BERT Score", TaskType.MT)
     alt_e_to_j_bleu_ja = Task("scores", "alt-e-to-j_bleu_ja", "ALT E to J BLEU", TaskType.MT)
-    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22", TaskType.MT)
+    alt_e_to_j_comet_wmt22 = Task("scores", "alt-e-to-j_comet_wmt22", "ALT E to J COMET WMT22 ⭐", TaskType.MT)
     alt_j_to_e_bert_score_en_f1 = Task("scores", "alt-j-to-e_bert_score_en_f1", "ALT J to E BERT Score", TaskType.MT)
     alt_j_to_e_bleu_en = Task("scores", "alt-j-to-e_bleu_en", "ALT J to E BLEU", TaskType.MT)
-    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22", TaskType.MT)
-    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA", TaskType.EL)
+    alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22 ⭐", TaskType.MT)
+    chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA ⭐", TaskType.EL)
     commonsensemoralja_exact_match = Task(
         "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA", TaskType.MC
     )
@@ -61,7 +61,7 @@ class Tasks(Enum):
     jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI", TaskType.NLI)
     jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM", TaskType.NLI)
     jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK", TaskType.NLI)
-    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad", TaskType.RC)
+    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad ⭐", TaskType.RC)
     jsts_pearson = Task(
         "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
     )  # Semantic Textual Similarity - 意味的類似度
@@ -69,8 +69,8 @@ class Tasks(Enum):
         "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
     )  # Semantic Textual Similarity - 意味的類似度
     kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI", TaskType.MC)
-    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS", TaskType.MR)
-    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec)", TaskType.CG)
+    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS ⭐", TaskType.MR)
+    mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec) ⭐", TaskType.CG)
     mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint)", TaskType.CG)
     mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU", TaskType.HE)
     niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC", TaskType.QA)
@@ -85,19 +85,19 @@ class Tasks(Enum):
     )
     wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU", TaskType.MT)
     wikicorpus_e_to_j_comet_wmt22 = Task(
-        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22 ⭐", TaskType.MT
     )
     wikicorpus_j_to_e_bert_score_en_f1 = Task(
         "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score", TaskType.MT
     )
     wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU", TaskType.MT)
     wikicorpus_j_to_e_comet_wmt22 = Task(
-        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22", TaskType.MT
+        "scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22 ⭐", TaskType.MT
     )
     xlsum_ja_bert_score_ja_f1 = Task("scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score", TaskType.SUM)
     xlsum_ja_bleu_ja = Task("scores", "xlsum_ja_bleu_ja", "XL-Sum JA BLEU", TaskType.SUM)
     xlsum_ja_rouge1 = Task("scores", "xlsum_ja_rouge1", "XL-Sum ROUGE1", TaskType.SUM)
-    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2", TaskType.SUM)
+    xlsum_ja_rouge2 = Task("scores", "xlsum_ja_rouge2", "XL-Sum ROUGE2 ⭐", TaskType.SUM)
     # xlsum_ja_rouge2_scaling = Task("scores", "xlsum_ja_rouge2_scaling", "XL-Sum JA ROUGE2 Scaling")
     xlsum_ja_rougeLsum = Task("scores", "xlsum_ja_rougeLsum", "XL-Sum ROUGE-Lsum", TaskType.SUM)
@@ -220,6 +220,17 @@ This task is supported by llm-jp-eval, but it is not included in the evaluation
 ## Reproducibility
 To reproduce our results, please follow the instructions of the evalution tool, **llm-jp-eval** available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
 
+## Average Score Calculation
+For the following task categories (RC, EL, MR, MT, CG, SUM), the tasks marked with ⭐ are included in the average calculation:
+
+Tasks included in average calculation:
+- RC: JSQuAD ⭐
+- EL: ChABSA ⭐
+- MR: MAWPS ⭐
+- MT: ALT E to J COMET WMT22 ⭐, ALT J to E COMET WMT22 ⭐, WikiCorpus E to J COMET WMT22 ⭐, WikiCorpus J to E COMET WMT22 ⭐
+- CG: MBPP (exec) ⭐
+- SUM: XL-Sum ROUGE2 ⭐
+
 """
 
 LLM_BENCHMARKS_TEXT_JA = """
@@ -302,6 +313,17 @@ LLM_BENCHMARKS_TEXT_JA = """
 
 ## 再現性
 我々の結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
+
+## 平均スコアの計算について
+以下のタスクカテゴリー(RC、EL、MR、MT、CG、SUM)において、⭐マークの付いたタスクのみが平均値の計算に含まれます:
+
+平均値計算に含まれるタスク:
+- RC:JSQuAD ⭐
+- EL:ChABSA ⭐
+- MR:MAWPS ⭐
+- MT:ALT E to J COMET WMT22 ⭐、ALT J to E COMET WMT22 ⭐、WikiCorpus E to J COMET WMT22 ⭐、WikiCorpus J to E COMET WMT22 ⭐
+- CG:MBPP (exec) ⭐
+- SUM:XL-Sum ROUGE2 ⭐
 """
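Note on the added explanation: the commit documents which metric feeds the average for each of the six affected categories (RC, EL, MR, MT, CG, SUM), but the aggregation code itself is not part of this diff. Below is a minimal sketch of the calculation as described, assuming each category's score is the mean of only its starred metrics and the headline AVG is a macro-average over the category means; the category-to-metric mapping mirrors the lists added above, while the function names, the score dictionary, and the macro-averaging step are illustrative and not taken from src/about.py.

from statistics import mean

# Hypothetical sketch, not code from src/about.py: one entry per category,
# listing only the metrics marked with ⭐ in this commit. Categories other
# than these six are omitted for brevity.
AVG_METRICS = {
    "RC": ["jsquad_char_f1"],
    "EL": ["chabsa_set_f1"],
    "MR": ["mawps_exact_match"],
    "MT": [
        "alt-e-to-j_comet_wmt22",
        "alt-j-to-e_comet_wmt22",
        "wikicorpus-e-to-j_comet_wmt22",
        "wikicorpus-j-to-e_comet_wmt22",
    ],
    "CG": ["mbpp_code_exec"],
    "SUM": ["xlsum_ja_rouge2"],
}

def category_averages(scores: dict[str, float]) -> dict[str, float]:
    # Average only the starred metrics within each category.
    return {
        category: mean(scores[metric] for metric in metrics)
        for category, metrics in AVG_METRICS.items()
        if all(metric in scores for metric in metrics)
    }

def overall_average(scores: dict[str, float]) -> float:
    # Macro-average over the per-category averages (an assumption, not
    # something stated in the diff).
    return mean(category_averages(scores).values())

# Usage with made-up scores:
example_scores = {
    "jsquad_char_f1": 0.82, "chabsa_set_f1": 0.55, "mawps_exact_match": 0.61,
    "alt-e-to-j_comet_wmt22": 0.88, "alt-j-to-e_comet_wmt22": 0.84,
    "wikicorpus-e-to-j_comet_wmt22": 0.79, "wikicorpus-j-to-e_comet_wmt22": 0.81,
    "mbpp_code_exec": 0.37, "xlsum_ja_rouge2": 0.12,
}
print(category_averages(example_scores))          # per-category means
print(round(overall_average(example_scores), 4))  # headline AVG

Restricting MT to the four COMET WMT22 scores, for example, keeps the BLEU and BERTScore columns visible on the leaderboard without letting them influence the headline number, which appears to be the intent of the sections added in this commit.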