sh1gechan committed
Commit 300792f · Parent: be318e9

Revised the description

Files changed (1): src/about.py (+23 -39)
src/about.py CHANGED
```diff
@@ -51,16 +51,16 @@ class Tasks(Enum):
     alt_j_to_e_comet_wmt22 = Task("scores", "alt-j-to-e_comet_wmt22", "ALT J to E COMET WMT22 ⭐", TaskType.MT)
     chabsa_set_f1 = Task("scores", "chabsa_set_f1", "ChABSA ⭐", TaskType.EL)
     commonsensemoralja_exact_match = Task(
-        "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA", TaskType.MC
+        "scores", "commonsensemoralja_exact_match", "CommonSenseMoralJA", TaskType.MC
     )
-    jamp_exact_match = Task("scores", "jamp_exact_match", "JAMP", TaskType.NLI)
-    janli_exact_match = Task("scores", "janli_exact_match", "JANLI", TaskType.NLI)
-    jcommonsenseqa_exact_match = Task("scores", "jcommonsenseqa_exact_match", "JCommonSenseQA", TaskType.MC)
-    jemhopqa_char_f1 = Task("scores", "jemhopqa_char_f1", "JEMHopQA", TaskType.QA)
-    jmmlu_exact_match = Task("scores", "jmmlu_exact_match", "JMMLU", TaskType.HE)
-    jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI", TaskType.NLI)
-    jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM", TaskType.NLI)
-    jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK", TaskType.NLI)
+    jamp_exact_match = Task("scores", "jamp_exact_match", "JAMP", TaskType.NLI)
+    janli_exact_match = Task("scores", "janli_exact_match", "JANLI", TaskType.NLI)
+    jcommonsenseqa_exact_match = Task("scores", "jcommonsenseqa_exact_match", "JCommonSenseQA", TaskType.MC)
+    jemhopqa_char_f1 = Task("scores", "jemhopqa_char_f1", "JEMHopQA", TaskType.QA)
+    jmmlu_exact_match = Task("scores", "jmmlu_exact_match", "JMMLU", TaskType.HE)
+    jnli_exact_match = Task("scores", "jnli_exact_match", "JNLI", TaskType.NLI)
+    jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM", TaskType.NLI)
+    jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK", TaskType.NLI)
     jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad ⭐", TaskType.RC)
     jsts_pearson = Task(
         "scores", "jsts_pearson", "JSTS (Pearson)", TaskType.STS
@@ -68,18 +68,18 @@ class Tasks(Enum):
     jsts_spearman = Task(
         "scores", "jsts_spearman", "JSTS (Spearman)", TaskType.STS
     ) # Semantic Textual Similarity - 意味的類似度
-    kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI", TaskType.MC)
+    kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI", TaskType.MC)
     mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS ⭐", TaskType.MR)
     mbpp_code_exec = Task("scores", "mbpp_code_exec", "MBPP (exec) ⭐", TaskType.CG)
     mbpp_pylint_check = Task("scores", "mbpp_pylint_check", "MBPP (pylint)", TaskType.CG)
-    mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU", TaskType.HE)
-    niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC", TaskType.QA)
-    aio_char_f1 = Task("scores", "aio_char_f1", "JAQKET", TaskType.QA)
-    wiki_coreference_set_f1 = Task("scores", "wiki_coreference_set_f1", "Wiki Coreference", TaskType.FA)
-    wiki_dependency_set_f1 = Task("scores", "wiki_dependency_set_f1", "Wiki Dependency", TaskType.FA)
-    wiki_ner_set_f1 = Task("scores", "wiki_ner_set_f1", "Wiki NER", TaskType.FA)
-    wiki_pas_set_f1 = Task("scores", "wiki_pas_set_f1", "Wiki PAS", TaskType.FA)
-    wiki_reading_char_f1 = Task("scores", "wiki_reading_char_f1", "Wiki Reading", TaskType.FA)
+    mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU", TaskType.HE)
+    niilc_char_f1 = Task("scores", "niilc_char_f1", "NIILC", TaskType.QA)
+    aio_char_f1 = Task("scores", "aio_char_f1", "JAQKET", TaskType.QA)
+    wiki_coreference_set_f1 = Task("scores", "wiki_coreference_set_f1", "Wiki Coreference", TaskType.FA)
+    wiki_dependency_set_f1 = Task("scores", "wiki_dependency_set_f1", "Wiki Dependency", TaskType.FA)
+    wiki_ner_set_f1 = Task("scores", "wiki_ner_set_f1", "Wiki NER", TaskType.FA)
+    wiki_pas_set_f1 = Task("scores", "wiki_pas_set_f1", "Wiki PAS", TaskType.FA)
+    wiki_reading_char_f1 = Task("scores", "wiki_reading_char_f1", "Wiki Reading", TaskType.FA)
     wikicorpus_e_to_j_bert_score_ja_f1 = Task(
         "scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score", TaskType.MT
     )
@@ -221,21 +221,13 @@ This task is supported by llm-jp-eval, but it is not included in the evaluation
 To reproduce our results, please follow the instructions of the evaluation tool, **llm-jp-eval**, available in [Japanese](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) and in [English](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md).
 
 ## Average Score Calculation
-For the following task categories (RC, EL, MR, MT, CG, SUM), the tasks marked with ⭐ are included in the average calculation:
-
-Tasks included in average calculation:
-- RC: JSQuAD ⭐
-- EL: ChABSA ⭐
-- MR: MAWPS ⭐
-- MT: ALT E to J COMET WMT22 ⭐, ALT J to E COMET WMT22 ⭐, WikiCorpus E to J COMET WMT22 ⭐, WikiCorpus J to E COMET WMT22 ⭐
-- CG: MBPP (exec) ⭐
-- SUM: XL-Sum ROUGE2 ⭐
+The calculation of the average score (AVG) includes only the scores marked with a ⭐.
 
 """
 
 LLM_BENCHMARKS_TEXT_JA = """
 ## 仕組み
-📈 我々は評価ツール [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) を活用し、16種類のタスクで日本語の大規模言語モデルを評価します。このツールは、様々な評価タスクで日本語LLMを評価するための統一的なフレームワークです。
+📈 評価ツール [llm-jp-eval](https://github.com/llm-jp/llm-jp-eval) を活用し、16種類のタスクで日本語の大規模言語モデルを評価します。このツールは、様々な評価タスクで日本語LLMを評価するための統一的なフレームワークです。
 
 **NLI(自然言語推論)**
 
@@ -293,7 +285,7 @@ LLM_BENCHMARKS_TEXT_JA = """
 
 **STS(意味的テキスト類似度)**
 
-このタスクはllm-jp-evalでサポートされていますが、評価スコアの平均には含まれていません。
+このタスクはllm-jp-evalでサポートされていますが、平均スコア (AVG) の計算には含まれていません。
 
 * `JSTS`、STS(Semantic Textual Similarity)の日本語版(JGLUEの一部)[ソース](https://github.com/yahoojapan/JGLUE)(ライセンス CC BY-SA 4.0)
 
@@ -312,18 +304,10 @@ LLM_BENCHMARKS_TEXT_JA = """
 * `XL-Sum`、44言語の大規模多言語抽象型要約データセットの日本語部分 [ソース](https://github.com/csebuetnlp/xl-sum)(ライセンス CC BY-NC-SA 4.0、非商用ライセンスのため、このデータセットは使用しません。ライセンスと利用規約に明確に同意した場合を除きます)
 
 ## 再現性
-我々の結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
+結果を再現するには、評価ツール **llm-jp-eval** の指示に従ってください。詳細は [日本語](https://github.com/llm-jp/llm-jp-eval/blob/main/README.md) と [英語](https://github.com/llm-jp/llm-jp-eval/blob/main/README_en.md) でご覧いただけます。
 
 ## 平均スコアの計算について
-以下のタスクカテゴリー(RC、EL、MR、MT、CG、SUM)において、⭐マークの付いたタスクのみが平均値の計算に含まれます:
-
-平均値計算に含まれるタスク:
-- RC:JSQuAD ⭐
-- EL:ChABSA ⭐
-- MR:MAWPS ⭐
-- MT:ALT E to J COMET WMT22 ⭐、ALT J to E COMET WMT22 ⭐、WikiCorpus E to J COMET WMT22 ⭐、WikiCorpus J to E COMET WMT22 ⭐
-- CG:MBPP (exec) ⭐
-- SUM:XL-Sum ROUGE2 ⭐
+平均スコア (AVG) の計算には、⭐マークのついたスコアのみが含まれます。
 """
 
```
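The rule this commit documents — only ⭐-marked scores enter the average (AVG) — can be made concrete against the `Tasks` enum shown in the diff. The sketch below is illustrative only, not the actual code in `src/about.py`: the `Task` field names (`benchmark`, `metric`, `col_name`) and the `average_score` helper are assumptions; only the three `Task(...)` constructor calls are copied from the diff.

```python
# Minimal sketch of the "⭐-only" average described by this commit.
# Hypothetical: the Task field names and average_score() are assumptions;
# only the three Task(...) constructor calls are copied from the diff.
from dataclasses import dataclass
from enum import Enum

TaskType = Enum("TaskType", "NLI QA RC MC EL FA MR MT HE CG SUM STS")


@dataclass(frozen=True)
class Task:
    benchmark: str       # e.g. "scores"
    metric: str          # score-column key, e.g. "jsquad_char_f1"
    col_name: str        # display name; a ⭐ marks inclusion in the AVG
    task_type: TaskType


class Tasks(Enum):
    jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad ⭐", TaskType.RC)
    jamp_exact_match = Task("scores", "jamp_exact_match", "JAMP", TaskType.NLI)
    mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS ⭐", TaskType.MR)


def average_score(scores: dict[str, float]) -> float:
    """Average only the metrics whose display name carries a ⭐."""
    starred = [t.value.metric for t in Tasks if "⭐" in t.value.col_name]
    return sum(scores[m] for m in starred) / len(starred)


# JAMP carries no ⭐, so it is excluded: AVG = (0.9 + 0.7) / 2 = 0.8
print(average_score({"jsquad_char_f1": 0.9,
                     "jamp_exact_match": 0.5,
                     "mawps_exact_match": 0.7}))
```

In this sketch the ⭐ in the display name is the single source of truth, which matches the revised description: a task joins or leaves the AVG simply by gaining or losing its star.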