Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- results/cross_lingual/zero_shot/cross_logiqa.csv +1 -0
- results/cross_lingual/zero_shot/cross_logiqa_no_prompt.csv +1 -0
- results/cross_lingual/zero_shot/cross_mmlu.csv +1 -0
- results/cross_lingual/zero_shot/cross_mmlu_no_prompt.csv +1 -0
- results/cross_lingual/zero_shot/cross_xquad_no_prompt.csv +1 -0
- results/cultural_reasoning/zero_shot/cn_eval.csv +1 -0
- results/cultural_reasoning/zero_shot/ph_eval.csv +1 -0
- results/cultural_reasoning/zero_shot/sg_eval.csv +1 -0
- results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv +1 -0
- results/cultural_reasoning/zero_shot/sg_eval_v2_mcq.csv +1 -0
- results/cultural_reasoning/zero_shot/sg_eval_v2_mcq_no_prompt.csv +1 -0
- results/cultural_reasoning/zero_shot/sg_eval_v2_open.csv +1 -0
- results/cultural_reasoning/zero_shot/us_eval.csv +1 -0
- results/dialogue/zero_shot/dialogsum.csv +1 -0
- results/dialogue/zero_shot/dream.csv +1 -0
- results/dialogue/zero_shot/samsum.csv +1 -0
- results/emotion/zero_shot/ind_emotion.csv +1 -0
- results/emotion/zero_shot/sst2.csv +1 -0
- results/flores_translation/zero_shot/ind2eng.csv +1 -0
- results/flores_translation/zero_shot/vie2eng.csv +1 -0
- results/flores_translation/zero_shot/zho2eng.csv +1 -0
- results/flores_translation/zero_shot/zsm2eng.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/cola.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/mnli.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/mrpc.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/ocnli.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/qnli.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/qqp.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/rte.csv +1 -0
- results/fundamental_nlp_tasks/zero_shot/wnli.csv +1 -0
- results/general_reasoning/zero_shot/c_eval.csv +1 -0
- results/general_reasoning/zero_shot/cmmlu.csv +1 -0
- results/general_reasoning/zero_shot/mmlu.csv +1 -0
- results/general_reasoning/zero_shot/zbench.csv +1 -0
results/cross_lingual/zero_shot/cross_logiqa.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.6931818181818182,0.6397727272727273,0.6654072695772988,0.
|
|
6 |
Qwen2_5_7B_Instruct,0.599025974025974,0.5034090909090908,0.5470709896292291,0.7102272727272727,0.7215909090909091,0.6136363636363636,0.6022727272727273,0.5738636363636364,0.5511363636363636,0.42045454545454547
|
7 |
Qwen2_5_1_5B_Instruct,0.46834415584415584,0.348538961038961,0.3996561615557665,0.5511363636363636,0.5909090909090909,0.4659090909090909,0.5113636363636364,0.4375,0.375,0.3465909090909091
|
8 |
Qwen2-72B-Instruct,0.6728896103896104,0.6762987012987012,0.6745898487968579,0.75,0.8068181818181818,0.6534090909090909,0.6193181818181818,0.625,0.6534090909090909,0.6022727272727273
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.4610389610389611,0.45097402597402597,0.4559509553669637,0.5965909090909091,0.48295454545454547,0.5,0.4602272727272727,0.42045454545454547,0.4034090909090909,0.36363636363636365
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.4829545454545454,0.4952922077922078,0.48904557518459746,0.5397727272727273,0.5340909090909091,0.4772727272727273,0.5056818181818182,0.4602272727272727,0.45454545454545453,0.4090909090909091
|
11 |
Meta-Llama-3.1-70B-Instruct,0.6566558441558442,0.598051948051948,0.6259852839118454,0.7443181818181818,0.7215909090909091,0.6647727272727273,0.6534090909090909,0.6193181818181818,0.625,0.5681818181818182
|
|
|
6 |
Qwen2_5_7B_Instruct,0.599025974025974,0.5034090909090908,0.5470709896292291,0.7102272727272727,0.7215909090909091,0.6136363636363636,0.6022727272727273,0.5738636363636364,0.5511363636363636,0.42045454545454547
|
7 |
Qwen2_5_1_5B_Instruct,0.46834415584415584,0.348538961038961,0.3996561615557665,0.5511363636363636,0.5909090909090909,0.4659090909090909,0.5113636363636364,0.4375,0.375,0.3465909090909091
|
8 |
Qwen2-72B-Instruct,0.6728896103896104,0.6762987012987012,0.6745898487968579,0.75,0.8068181818181818,0.6534090909090909,0.6193181818181818,0.625,0.6534090909090909,0.6022727272727273
|
9 |
+
Sailor2-8B-Chat,0.5405844155844156,0.5628246753246753,0.551480408610067,0.625,0.5852272727272727,0.4943181818181818,0.5568181818181818,0.5056818181818182,0.5568181818181818,0.4602272727272727
|
10 |
Meta-Llama-3-8B-Instruct,0.4610389610389611,0.45097402597402597,0.4559509553669637,0.5965909090909091,0.48295454545454547,0.5,0.4602272727272727,0.42045454545454547,0.4034090909090909,0.36363636363636365
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.4829545454545454,0.4952922077922078,0.48904557518459746,0.5397727272727273,0.5340909090909091,0.4772727272727273,0.5056818181818182,0.4602272727272727,0.45454545454545453,0.4090909090909091
|
12 |
Meta-Llama-3.1-70B-Instruct,0.6566558441558442,0.598051948051948,0.6259852839118454,0.7443181818181818,0.7215909090909091,0.6647727272727273,0.6534090909090909,0.6193181818181818,0.625,0.5681818181818182
|
results/cross_lingual/zero_shot/cross_logiqa_no_prompt.csv
CHANGED
@@ -2,6 +2,7 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.512987012987013,0.4394480519480519,0.4733785048611023,0.5852272727272727,0.5852272727272727,0.5454545454545454,0.5,0.45454545454545453,0.5227272727272727,0.3977272727272727
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.45779220779220786,0.3751623376623376,0.412378792469608,0.5284090909090909,0.5170454545454546,0.5340909090909091,0.4602272727272727,0.4034090909090909,0.4431818181818182,0.3181818181818182
|
4 |
Qwen2_5_7B_Instruct,0.6047077922077922,0.47938311688311697,0.5348014705675028,0.6931818181818182,0.7102272727272727,0.6420454545454546,0.5795454545454546,0.6306818181818182,0.5340909090909091,0.4431818181818182
|
|
|
5 |
merged_llama3_8b_sg_inst_avg_diff,0.5105519480519481,0.4558441558441559,0.48164954476113636,0.5909090909090909,0.5284090909090909,0.5454545454545454,0.5,0.4943181818181818,0.48863636363636365,0.42613636363636365
|
6 |
SeaLLMs-v3-7B-Chat,0.5324675324675324,0.41266233766233773,0.46497164802104307,0.5681818181818182,0.5852272727272727,0.5738636363636364,0.5568181818181818,0.4943181818181818,0.5170454545454546,0.4318181818181818
|
7 |
gemma-2-9b-it,0.6006493506493508,0.5753246753246755,0.587714328691409,0.6590909090909091,0.6363636363636364,0.5511363636363636,0.6022727272727273,0.5852272727272727,0.6022727272727273,0.5681818181818182
|
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.512987012987013,0.4394480519480519,0.4733785048611023,0.5852272727272727,0.5852272727272727,0.5454545454545454,0.5,0.45454545454545453,0.5227272727272727,0.3977272727272727
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.45779220779220786,0.3751623376623376,0.412378792469608,0.5284090909090909,0.5170454545454546,0.5340909090909091,0.4602272727272727,0.4034090909090909,0.4431818181818182,0.3181818181818182
|
4 |
Qwen2_5_7B_Instruct,0.6047077922077922,0.47938311688311697,0.5348014705675028,0.6931818181818182,0.7102272727272727,0.6420454545454546,0.5795454545454546,0.6306818181818182,0.5340909090909091,0.4431818181818182
|
5 |
+
Sailor2-8B-Chat,0.5503246753246753,0.5363636363636365,0.5432544747850031,0.6136363636363636,0.625,0.5056818181818182,0.5625,0.5113636363636364,0.5511363636363636,0.48295454545454547
|
6 |
merged_llama3_8b_sg_inst_avg_diff,0.5105519480519481,0.4558441558441559,0.48164954476113636,0.5909090909090909,0.5284090909090909,0.5454545454545454,0.5,0.4943181818181818,0.48863636363636365,0.42613636363636365
|
7 |
SeaLLMs-v3-7B-Chat,0.5324675324675324,0.41266233766233773,0.46497164802104307,0.5681818181818182,0.5852272727272727,0.5738636363636364,0.5568181818181818,0.4943181818181818,0.5170454545454546,0.4318181818181818
|
8 |
gemma-2-9b-it,0.6006493506493508,0.5753246753246755,0.587714328691409,0.6590909090909091,0.6363636363636364,0.5511363636363636,0.6022727272727273,0.5852272727272727,0.6022727272727273,0.5681818181818182
|
results/cross_lingual/zero_shot/cross_mmlu.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8019047619047619,0.7386666666666668,0.7689878008073214,0.
|
|
6 |
Qwen2_5_7B_Instruct,0.6733333333333332,0.580952380952381,0.6237408250578389,0.7666666666666667,0.7066666666666667,0.72,0.6666666666666666,0.6866666666666666,0.6266666666666667,0.54
|
7 |
Qwen2_5_1_5B_Instruct,0.5076190476190475,0.3721904761904762,0.42948154099799957,0.6,0.6066666666666667,0.5333333333333333,0.4866666666666667,0.5666666666666667,0.4,0.36
|
8 |
Qwen2-72B-Instruct,0.779047619047619,0.7611428571428573,0.7699911663398871,0.8133333333333334,0.7933333333333333,0.7933333333333333,0.7333333333333333,0.7666666666666667,0.78,0.7733333333333333
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.5733333333333334,0.4742857142857144,0.5191272726777197,0.7133333333333334,0.5866666666666667,0.5733333333333334,0.5866666666666667,0.5066666666666667,0.5333333333333333,0.5133333333333333
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5980952380952381,0.5817142857142859,0.5897910419722433,0.76,0.5866666666666667,0.6266666666666667,0.5466666666666666,0.5666666666666667,0.5533333333333333,0.5466666666666666
|
11 |
Meta-Llama-3.1-70B-Instruct,0.7638095238095238,0.7716190476190474,0.7676944251955988,0.8,0.74,0.7666666666666667,0.7666666666666667,0.76,0.7666666666666667,0.7466666666666667
|
|
|
6 |
Qwen2_5_7B_Instruct,0.6733333333333332,0.580952380952381,0.6237408250578389,0.7666666666666667,0.7066666666666667,0.72,0.6666666666666666,0.6866666666666666,0.6266666666666667,0.54
|
7 |
Qwen2_5_1_5B_Instruct,0.5076190476190475,0.3721904761904762,0.42948154099799957,0.6,0.6066666666666667,0.5333333333333333,0.4866666666666667,0.5666666666666667,0.4,0.36
|
8 |
Qwen2-72B-Instruct,0.779047619047619,0.7611428571428573,0.7699911663398871,0.8133333333333334,0.7933333333333333,0.7933333333333333,0.7333333333333333,0.7666666666666667,0.78,0.7733333333333333
|
9 |
+
Sailor2-8B-Chat,0.6542857142857142,0.6586666666666667,0.6564688814239598,0.7133333333333334,0.6733333333333333,0.6533333333333333,0.6066666666666667,0.62,0.6466666666666666,0.6666666666666666
|
10 |
Meta-Llama-3-8B-Instruct,0.5733333333333334,0.4742857142857144,0.5191272726777197,0.7133333333333334,0.5866666666666667,0.5733333333333334,0.5866666666666667,0.5066666666666667,0.5333333333333333,0.5133333333333333
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5980952380952381,0.5817142857142859,0.5897910419722433,0.76,0.5866666666666667,0.6266666666666667,0.5466666666666666,0.5666666666666667,0.5533333333333333,0.5466666666666666
|
12 |
Meta-Llama-3.1-70B-Instruct,0.7638095238095238,0.7716190476190474,0.7676944251955988,0.8,0.74,0.7666666666666667,0.7666666666666667,0.76,0.7666666666666667,0.7466666666666667
|
results/cross_lingual/zero_shot/cross_mmlu_no_prompt.csv
CHANGED
@@ -2,6 +2,7 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.6876190476190477,0.5615238095238096,0.6182070607559236,0.82,0.6333333333333333,0.72,0.6666666666666666,0.66,0.6466666666666666,0.6666666666666666
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.6676190476190476,0.5590476190476189,0.6085285418019147,0.7533333333333333,0.6666666666666666,0.68,0.6333333333333333,0.6933333333333334,0.64,0.6066666666666667
|
4 |
Qwen2_5_7B_Instruct,0.7742857142857141,0.6222857142857142,0.6900140284752591,0.8466666666666667,0.84,0.8266666666666667,0.74,0.7533333333333333,0.7133333333333334,0.7
|
|
|
5 |
Meta-Llama-3-8B-Instruct,0.6428571428571429,0.49542857142857133,0.5595955249078094,0.7666666666666667,0.6533333333333333,0.7,0.6466666666666666,0.5733333333333334,0.5733333333333334,0.5866666666666667
|
6 |
merged_llama3_8b_sg_inst_avg_diff,0.6980952380952381,0.5891428571428572,0.6390081595918414,0.8466666666666667,0.6933333333333334,0.6933333333333334,0.6933333333333334,0.7133333333333334,0.6133333333333333,0.6333333333333333
|
7 |
SeaLLMs-v3-7B-Chat,0.7342857142857142,0.5765714285714287,0.6459409639562039,0.8333333333333334,0.7266666666666667,0.7866666666666666,0.7133333333333334,0.74,0.6866666666666666,0.6533333333333333
|
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.6876190476190477,0.5615238095238096,0.6182070607559236,0.82,0.6333333333333333,0.72,0.6666666666666666,0.66,0.6466666666666666,0.6666666666666666
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.6676190476190476,0.5590476190476189,0.6085285418019147,0.7533333333333333,0.6666666666666666,0.68,0.6333333333333333,0.6933333333333334,0.64,0.6066666666666667
|
4 |
Qwen2_5_7B_Instruct,0.7742857142857141,0.6222857142857142,0.6900140284752591,0.8466666666666667,0.84,0.8266666666666667,0.74,0.7533333333333333,0.7133333333333334,0.7
|
5 |
+
Sailor2-8B-Chat,0.6923809523809524,0.6592380952380954,0.6754031781322388,0.7266666666666667,0.7066666666666667,0.7133333333333334,0.6733333333333333,0.6733333333333333,0.6466666666666666,0.7066666666666667
|
6 |
Meta-Llama-3-8B-Instruct,0.6428571428571429,0.49542857142857133,0.5595955249078094,0.7666666666666667,0.6533333333333333,0.7,0.6466666666666666,0.5733333333333334,0.5733333333333334,0.5866666666666667
|
7 |
merged_llama3_8b_sg_inst_avg_diff,0.6980952380952381,0.5891428571428572,0.6390081595918414,0.8466666666666667,0.6933333333333334,0.6933333333333334,0.6933333333333334,0.7133333333333334,0.6133333333333333,0.6333333333333333
|
8 |
SeaLLMs-v3-7B-Chat,0.7342857142857142,0.5765714285714287,0.6459409639562039,0.8333333333333334,0.7266666666666667,0.7866666666666666,0.7133333333333334,0.74,0.6866666666666666,0.6533333333333333
|
results/cross_lingual/zero_shot/cross_xquad_no_prompt.csv
CHANGED
@@ -2,6 +2,7 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.9168067226890756,0.8292016806722688,0.870806433460842,0.9436974789915966,0.8949579831932774,0.9201680672268907,0.9084033613445378,,,
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.928781512605042,0.8592436974789917,0.892660412722869,0.9470588235294117,0.9084033613445378,0.9352941176470588,0.9243697478991597,,,
|
4 |
Qwen2_5_7B_Instruct,0.9069327731092437,0.8264705882352941,0.8648342089942876,0.9210084033613445,0.8991596638655462,0.9092436974789916,0.8983193277310925,,,
|
|
|
5 |
Meta-Llama-3-8B-Instruct,0.9060924369747899,0.8224789915966386,0.8622634639161603,0.9319327731092437,0.8932773109243698,0.9134453781512605,0.8857142857142857,,,
|
6 |
merged_llama3_8b_sg_inst_avg_diff,0.9117647058823529,0.8266806722689075,0.8671405721911006,0.9302521008403362,0.8899159663865546,0.9210084033613445,0.9058823529411765,,,
|
7 |
SeaLLMs-v3-7B-Chat,0.8943277310924369,0.7991596638655463,0.8440696412045011,0.9210084033613445,0.8773109243697479,0.9,0.8789915966386554,,,
|
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.9168067226890756,0.8292016806722688,0.870806433460842,0.9436974789915966,0.8949579831932774,0.9201680672268907,0.9084033613445378,,,
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.928781512605042,0.8592436974789917,0.892660412722869,0.9470588235294117,0.9084033613445378,0.9352941176470588,0.9243697478991597,,,
|
4 |
Qwen2_5_7B_Instruct,0.9069327731092437,0.8264705882352941,0.8648342089942876,0.9210084033613445,0.8991596638655462,0.9092436974789916,0.8983193277310925,,,
|
5 |
+
Sailor2-8B-Chat,0.9086134453781513,0.8378151260504201,0.8717792421413649,0.9252100840336135,0.8949579831932774,0.9117647058823529,0.9025210084033614,,,
|
6 |
Meta-Llama-3-8B-Instruct,0.9060924369747899,0.8224789915966386,0.8622634639161603,0.9319327731092437,0.8932773109243698,0.9134453781512605,0.8857142857142857,,,
|
7 |
merged_llama3_8b_sg_inst_avg_diff,0.9117647058823529,0.8266806722689075,0.8671405721911006,0.9302521008403362,0.8899159663865546,0.9210084033613445,0.9058823529411765,,,
|
8 |
SeaLLMs-v3-7B-Chat,0.8943277310924369,0.7991596638655463,0.8440696412045011,0.9210084033613445,0.8773109243697479,0.9,0.8789915966386554,,,
|
results/cultural_reasoning/zero_shot/cn_eval.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8476190476190476
|
|
6 |
Qwen2_5_7B_Instruct,0.8
|
7 |
Qwen2_5_1_5B_Instruct,0.5523809523809524
|
8 |
Qwen2-72B-Instruct,0.8285714285714286
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.4666666666666667
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5142857142857142
|
11 |
Meta-Llama-3.1-70B-Instruct,0.5428571428571428
|
|
|
6 |
Qwen2_5_7B_Instruct,0.8
|
7 |
Qwen2_5_1_5B_Instruct,0.5523809523809524
|
8 |
Qwen2-72B-Instruct,0.8285714285714286
|
9 |
+
Sailor2-8B-Chat,0.7142857142857143
|
10 |
Meta-Llama-3-8B-Instruct,0.4666666666666667
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5142857142857142
|
12 |
Meta-Llama-3.1-70B-Instruct,0.5428571428571428
|
results/cultural_reasoning/zero_shot/ph_eval.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.7
|
|
6 |
Qwen2_5_7B_Instruct,0.55
|
7 |
Qwen2_5_1_5B_Instruct,0.37
|
8 |
Qwen2-72B-Instruct,0.62
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.58
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.54
|
11 |
Meta-Llama-3.1-70B-Instruct,0.68
|
|
|
6 |
Qwen2_5_7B_Instruct,0.55
|
7 |
Qwen2_5_1_5B_Instruct,0.37
|
8 |
Qwen2-72B-Instruct,0.62
|
9 |
+
Sailor2-8B-Chat,0.53
|
10 |
Meta-Llama-3-8B-Instruct,0.58
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.54
|
12 |
Meta-Llama-3.1-70B-Instruct,0.68
|
results/cultural_reasoning/zero_shot/sg_eval.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.7184466019417476
|
|
6 |
Qwen2_5_7B_Instruct,0.6699029126213593
|
7 |
Qwen2_5_1_5B_Instruct,0.5048543689320388
|
8 |
Qwen2-72B-Instruct,0.7378640776699029
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.6504854368932039
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.6019417475728155
|
11 |
Meta-Llama-3.1-70B-Instruct,0.7184466019417476
|
|
|
6 |
Qwen2_5_7B_Instruct,0.6699029126213593
|
7 |
Qwen2_5_1_5B_Instruct,0.5048543689320388
|
8 |
Qwen2-72B-Instruct,0.7378640776699029
|
9 |
+
Sailor2-8B-Chat,0.6019417475728155
|
10 |
Meta-Llama-3-8B-Instruct,0.6504854368932039
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.6019417475728155
|
12 |
Meta-Llama-3.1-70B-Instruct,0.7184466019417476
|
results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.6470588235294118
|
|
6 |
Qwen2_5_7B_Instruct,0.5882352941176471
|
7 |
Qwen2_5_1_5B_Instruct,0.47058823529411764
|
8 |
Qwen2-72B-Instruct,0.6764705882352942
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.5882352941176471
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5441176470588235
|
11 |
Meta-Llama-3.1-70B-Instruct,0.6617647058823529
|
|
|
6 |
Qwen2_5_7B_Instruct,0.5882352941176471
|
7 |
Qwen2_5_1_5B_Instruct,0.47058823529411764
|
8 |
Qwen2-72B-Instruct,0.6764705882352942
|
9 |
+
Sailor2-8B-Chat,0.5735294117647058
|
10 |
Meta-Llama-3-8B-Instruct,0.5882352941176471
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5441176470588235
|
12 |
Meta-Llama-3.1-70B-Instruct,0.6617647058823529
|
results/cultural_reasoning/zero_shot/sg_eval_v2_mcq.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8436363636363636
|
|
6 |
Qwen2_5_7B_Instruct,0.78
|
7 |
Qwen2_5_1_5B_Instruct,0.6636363636363637
|
8 |
Qwen2-72B-Instruct,0.8581818181818182
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.7909090909090909
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.8109090909090909
|
11 |
Meta-Llama-3.1-70B-Instruct,0.8763636363636363
|
|
|
6 |
Qwen2_5_7B_Instruct,0.78
|
7 |
Qwen2_5_1_5B_Instruct,0.6636363636363637
|
8 |
Qwen2-72B-Instruct,0.8581818181818182
|
9 |
+
Sailor2-8B-Chat,0.730909090909091
|
10 |
Meta-Llama-3-8B-Instruct,0.7909090909090909
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.8109090909090909
|
12 |
Meta-Llama-3.1-70B-Instruct,0.8763636363636363
|
results/cultural_reasoning/zero_shot/sg_eval_v2_mcq_no_prompt.csv
CHANGED
@@ -2,6 +2,7 @@ Model,Accuracy
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.7418181818181818
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.7945454545454546
|
4 |
Qwen2_5_7B_Instruct,0.7654545454545455
|
|
|
5 |
Meta-Llama-3-8B-Instruct,0.8054545454545454
|
6 |
merged_llama3_8b_sg_inst_avg_diff,0.7854545454545454
|
7 |
SeaLLMs-v3-7B-Chat,0.7581818181818182
|
|
|
2 |
Meta-Llama-3.1-8B-Instruct,0.7418181818181818
|
3 |
llama3-8b-cpt-sea-lionv2.1-instruct,0.7945454545454546
|
4 |
Qwen2_5_7B_Instruct,0.7654545454545455
|
5 |
+
Sailor2-8B-Chat,0.7145454545454546
|
6 |
Meta-Llama-3-8B-Instruct,0.8054545454545454
|
7 |
merged_llama3_8b_sg_inst_avg_diff,0.7854545454545454
|
8 |
SeaLLMs-v3-7B-Chat,0.7581818181818182
|
results/cultural_reasoning/zero_shot/sg_eval_v2_open.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,53.2
|
|
6 |
Qwen2_5_7B_Instruct,50.279999999999994
|
7 |
Qwen2_5_1_5B_Instruct,44.480000000000004
|
8 |
Qwen2-72B-Instruct,54.080000000000005
|
|
|
9 |
Meta-Llama-3-8B-Instruct,51.120000000000005
|
10 |
merged_llama3_8b_sg_inst_avg_diff,49.2
|
11 |
Meta-Llama-3.1-70B-Instruct,51.31999999999999
|
|
|
6 |
Qwen2_5_7B_Instruct,50.279999999999994
|
7 |
Qwen2_5_1_5B_Instruct,44.480000000000004
|
8 |
Qwen2-72B-Instruct,54.080000000000005
|
9 |
+
Sailor2-8B-Chat,54.36
|
10 |
Meta-Llama-3-8B-Instruct,51.120000000000005
|
11 |
merged_llama3_8b_sg_inst_avg_diff,49.2
|
12 |
Meta-Llama-3.1-70B-Instruct,51.31999999999999
|
results/cultural_reasoning/zero_shot/us_eval.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8411214953271028
|
|
6 |
Qwen2_5_7B_Instruct,0.7663551401869159
|
7 |
Qwen2_5_1_5B_Instruct,0.5981308411214953
|
8 |
Qwen2-72B-Instruct,0.8785046728971962
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.7009345794392523
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.7383177570093458
|
11 |
Meta-Llama-3.1-70B-Instruct,0.8411214953271028
|
|
|
6 |
Qwen2_5_7B_Instruct,0.7663551401869159
|
7 |
Qwen2_5_1_5B_Instruct,0.5981308411214953
|
8 |
Qwen2-72B-Instruct,0.8785046728971962
|
9 |
+
Sailor2-8B-Chat,0.7009345794392523
|
10 |
Meta-Llama-3-8B-Instruct,0.7009345794392523
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.7383177570093458
|
12 |
Meta-Llama-3.1-70B-Instruct,0.8411214953271028
|
results/dialogue/zero_shot/dialogsum.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.2393912015484827,0.3451081398022419,0.11160543395371676,0
|
|
6 |
Qwen2_5_7B_Instruct,0.2502928721533066,0.35566069744050016,0.12210269253668227,0.27311522648273734
|
7 |
Qwen2_5_1_5B_Instruct,0.20263242988485167,0.30002072253966694,0.08416670238558713,0.22370986472930096
|
8 |
Qwen2-72B-Instruct,0.2183280630214023,0.316174552903144,0.10156543495268992,0.23724420120837297
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.23978455271183616,0.33971099717559883,0.1203340311564728,0.2593086298034369
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.25236243090492,0.3573462392196718,0.125506438977953,0.27423461451713527
|
11 |
Meta-Llama-3.1-70B-Instruct,0.2526239717396146,0.35714386898604744,0.1258832921736473,0.27484475405914904
|
|
|
6 |
Qwen2_5_7B_Instruct,0.2502928721533066,0.35566069744050016,0.12210269253668227,0.27311522648273734
|
7 |
Qwen2_5_1_5B_Instruct,0.20263242988485167,0.30002072253966694,0.08416670238558713,0.22370986472930096
|
8 |
Qwen2-72B-Instruct,0.2183280630214023,0.316174552903144,0.10156543495268992,0.23724420120837297
|
9 |
+
Sailor2-8B-Chat,0.19777087324327317,0.2970393044008424,0.07701994204737679,0.21925337328160027
|
10 |
Meta-Llama-3-8B-Instruct,0.23978455271183616,0.33971099717559883,0.1203340311564728,0.2593086298034369
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.25236243090492,0.3573462392196718,0.125506438977953,0.27423461451713527
|
12 |
Meta-Llama-3.1-70B-Instruct,0.2526239717396146,0.35714386898604744,0.1258832921736473,0.27484475405914904
|
results/dialogue/zero_shot/dream.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.9559039686428221
|
|
6 |
Qwen2_5_7B_Instruct,0.9348358647721705
|
7 |
Qwen2_5_1_5B_Instruct,0.8314551690347869
|
8 |
Qwen2-72B-Instruct,0.9612934835864773
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.8946594806467418
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.9103380695737384
|
11 |
Meta-Llama-3.1-70B-Instruct,0.9559039686428221
|
|
|
6 |
Qwen2_5_7B_Instruct,0.9348358647721705
|
7 |
Qwen2_5_1_5B_Instruct,0.8314551690347869
|
8 |
Qwen2-72B-Instruct,0.9612934835864773
|
9 |
+
Sailor2-8B-Chat,0.9054385105340519
|
10 |
Meta-Llama-3-8B-Instruct,0.8946594806467418
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.9103380695737384
|
12 |
Meta-Llama-3.1-70B-Instruct,0.9559039686428221
|
results/dialogue/zero_shot/samsum.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.2844232627209405,0.3986263552639068,0.14766658533002341,0
|
|
6 |
Qwen2_5_7B_Instruct,0.2987576845890178,0.4163299367235864,0.1599063413842216,0.32003677565924527
|
7 |
Qwen2_5_1_5B_Instruct,0.2333120091694482,0.34339111721032756,0.10195887716459845,0.25458603313341865
|
8 |
Qwen2-72B-Instruct,0.2800906719573321,0.3887231369098802,0.15237661526996754,0.29917226369214855
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.2846315092346869,0.39397110152251813,0.154320846916639,0.30560257926490364
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.2827552959388026,0.3953429193664384,0.14797005050571224,0.30495291794425716
|
11 |
Meta-Llama-3.1-70B-Instruct,0.28934874612070227,0.4036295731242805,0.15211190810296196,0.31230475713486433
|
|
|
6 |
Qwen2_5_7B_Instruct,0.2987576845890178,0.4163299367235864,0.1599063413842216,0.32003677565924527
|
7 |
Qwen2_5_1_5B_Instruct,0.2333120091694482,0.34339111721032756,0.10195887716459845,0.25458603313341865
|
8 |
Qwen2-72B-Instruct,0.2800906719573321,0.3887231369098802,0.15237661526996754,0.29917226369214855
|
9 |
+
Sailor2-8B-Chat,0.23525560304744508,0.34567892481583223,0.10170204161284628,0.2583858427136568
|
10 |
Meta-Llama-3-8B-Instruct,0.2846315092346869,0.39397110152251813,0.154320846916639,0.30560257926490364
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.2827552959388026,0.3953429193664384,0.14797005050571224,0.30495291794425716
|
12 |
Meta-Llama-3.1-70B-Instruct,0.28934874612070227,0.4036295731242805,0.15211190810296196,0.31230475713486433
|
results/emotion/zero_shot/ind_emotion.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.6909090909090909
|
|
6 |
Qwen2_5_7B_Instruct,0.6636363636363637
|
7 |
Qwen2_5_1_5B_Instruct,0.5795454545454546
|
8 |
Qwen2-72B-Instruct,0.675
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.6522727272727272
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.7
|
11 |
Meta-Llama-3.1-70B-Instruct,0.7159090909090909
|
|
|
6 |
Qwen2_5_7B_Instruct,0.6636363636363637
|
7 |
Qwen2_5_1_5B_Instruct,0.5795454545454546
|
8 |
Qwen2-72B-Instruct,0.675
|
9 |
+
Sailor2-8B-Chat,0.7363636363636363
|
10 |
Meta-Llama-3-8B-Instruct,0.6522727272727272
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.7
|
12 |
Meta-Llama-3.1-70B-Instruct,0.7159090909090909
|
results/emotion/zero_shot/sst2.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.9472477064220184
|
|
6 |
Qwen2_5_7B_Instruct,0.9254587155963303
|
7 |
Qwen2_5_1_5B_Instruct,0.9231651376146789
|
8 |
Qwen2-72B-Instruct,0.9346330275229358
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.8784403669724771
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.8841743119266054
|
11 |
Meta-Llama-3.1-70B-Instruct,0.9529816513761468
|
|
|
6 |
Qwen2_5_7B_Instruct,0.9254587155963303
|
7 |
Qwen2_5_1_5B_Instruct,0.9231651376146789
|
8 |
Qwen2-72B-Instruct,0.9346330275229358
|
9 |
+
Sailor2-8B-Chat,0.9461009174311926
|
10 |
Meta-Llama-3-8B-Instruct,0.8784403669724771
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.8841743119266054
|
12 |
Meta-Llama-3.1-70B-Instruct,0.9529816513761468
|
results/flores_translation/zero_shot/ind2eng.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.3923422946746861
|
|
6 |
Qwen2_5_7B_Instruct,0.36472669481333536
|
7 |
Qwen2_5_1_5B_Instruct,0.2624938515155373
|
8 |
Qwen2-72B-Instruct,0.4043588265556185
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.33079891679041123
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.38376586000725804
|
11 |
Meta-Llama-3.1-70B-Instruct,0.43366494500251235
|
|
|
6 |
Qwen2_5_7B_Instruct,0.36472669481333536
|
7 |
Qwen2_5_1_5B_Instruct,0.2624938515155373
|
8 |
Qwen2-72B-Instruct,0.4043588265556185
|
9 |
+
Sailor2-8B-Chat,0.2487972955646591
|
10 |
Meta-Llama-3-8B-Instruct,0.33079891679041123
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.38376586000725804
|
12 |
Meta-Llama-3.1-70B-Instruct,0.43366494500251235
|
results/flores_translation/zero_shot/vie2eng.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.33791529833420336
|
|
6 |
Qwen2_5_7B_Instruct,0.3027564749728372
|
7 |
Qwen2_5_1_5B_Instruct,0.21935649300365245
|
8 |
Qwen2-72B-Instruct,0.33005323227052946
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.2647448190950291
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.30900856944791294
|
11 |
Meta-Llama-3.1-70B-Instruct,0.37244508311079816
|
|
|
6 |
Qwen2_5_7B_Instruct,0.3027564749728372
|
7 |
Qwen2_5_1_5B_Instruct,0.21935649300365245
|
8 |
Qwen2-72B-Instruct,0.33005323227052946
|
9 |
+
Sailor2-8B-Chat,0.1825857920682635
|
10 |
Meta-Llama-3-8B-Instruct,0.2647448190950291
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.30900856944791294
|
12 |
Meta-Llama-3.1-70B-Instruct,0.37244508311079816
|
results/flores_translation/zero_shot/zho2eng.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.26924811164378015
|
|
6 |
Qwen2_5_7B_Instruct,0.2437311220019033
|
7 |
Qwen2_5_1_5B_Instruct,0.18420680441018222
|
8 |
Qwen2-72B-Instruct,0.23893268538329387
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.199495011482748
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.24133164017585856
|
11 |
Meta-Llama-3.1-70B-Instruct,0.2832594176173152
|
|
|
6 |
Qwen2_5_7B_Instruct,0.2437311220019033
|
7 |
Qwen2_5_1_5B_Instruct,0.18420680441018222
|
8 |
Qwen2-72B-Instruct,0.23893268538329387
|
9 |
+
Sailor2-8B-Chat,0.16539980828035464
|
10 |
Meta-Llama-3-8B-Instruct,0.199495011482748
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.24133164017585856
|
12 |
Meta-Llama-3.1-70B-Instruct,0.2832594176173152
|
results/flores_translation/zero_shot/zsm2eng.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.40310877536446654
|
|
6 |
Qwen2_5_7B_Instruct,0.3466422765302921
|
7 |
Qwen2_5_1_5B_Instruct,0.22890805100949677
|
8 |
Qwen2-72B-Instruct,0.40796892621611885
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.31625368345049
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.3729790018011108
|
11 |
Meta-Llama-3.1-70B-Instruct,0.4462132282683508
|
|
|
6 |
Qwen2_5_7B_Instruct,0.3466422765302921
|
7 |
Qwen2_5_1_5B_Instruct,0.22890805100949677
|
8 |
Qwen2-72B-Instruct,0.40796892621611885
|
9 |
+
Sailor2-8B-Chat,0.269986448536842
|
10 |
Meta-Llama-3-8B-Instruct,0.31625368345049
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.3729790018011108
|
12 |
Meta-Llama-3.1-70B-Instruct,0.4462132282683508
|
results/fundamental_nlp_tasks/zero_shot/cola.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8427612655800575
|
|
6 |
Qwen2_5_7B_Instruct,0.7909875359539789
|
7 |
Qwen2_5_1_5B_Instruct,0.7497603068072867
|
8 |
Qwen2-72B-Instruct,0.8341323106423778
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.6548418024928092
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.6174496644295302
|
11 |
Meta-Llama-3.1-70B-Instruct,0.850431447746884
|
|
|
6 |
Qwen2_5_7B_Instruct,0.7909875359539789
|
7 |
Qwen2_5_1_5B_Instruct,0.7497603068072867
|
8 |
Qwen2-72B-Instruct,0.8341323106423778
|
9 |
+
Sailor2-8B-Chat,0.7900287631831256
|
10 |
Meta-Llama-3-8B-Instruct,0.6548418024928092
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.6174496644295302
|
12 |
Meta-Llama-3.1-70B-Instruct,0.850431447746884
|
results/fundamental_nlp_tasks/zero_shot/mnli.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8715
|
|
6 |
Qwen2_5_7B_Instruct,0.8105
|
7 |
Qwen2_5_1_5B_Instruct,0.6045
|
8 |
Qwen2-72B-Instruct,0.7925
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.546
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5375
|
11 |
Meta-Llama-3.1-70B-Instruct,0.7015
|
|
|
6 |
Qwen2_5_7B_Instruct,0.8105
|
7 |
Qwen2_5_1_5B_Instruct,0.6045
|
8 |
Qwen2-72B-Instruct,0.7925
|
9 |
+
Sailor2-8B-Chat,0.664
|
10 |
Meta-Llama-3-8B-Instruct,0.546
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5375
|
12 |
Meta-Llama-3.1-70B-Instruct,0.7015
|
results/fundamental_nlp_tasks/zero_shot/mrpc.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.7745098039215687
|
|
6 |
Qwen2_5_7B_Instruct,0.7058823529411765
|
7 |
Qwen2_5_1_5B_Instruct,0.6838235294117647
|
8 |
Qwen2-72B-Instruct,0.8063725490196079
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.678921568627451
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.6274509803921569
|
11 |
Meta-Llama-3.1-70B-Instruct,0.7696078431372549
|
|
|
6 |
Qwen2_5_7B_Instruct,0.7058823529411765
|
7 |
Qwen2_5_1_5B_Instruct,0.6838235294117647
|
8 |
Qwen2-72B-Instruct,0.8063725490196079
|
9 |
+
Sailor2-8B-Chat,0.7769607843137255
|
10 |
Meta-Llama-3-8B-Instruct,0.678921568627451
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.6274509803921569
|
12 |
Meta-Llama-3.1-70B-Instruct,0.7696078431372549
|
results/fundamental_nlp_tasks/zero_shot/ocnli.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.7742372881355932
|
|
6 |
Qwen2_5_7B_Instruct,0.6732203389830509
|
7 |
Qwen2_5_1_5B_Instruct,0.5135593220338983
|
8 |
Qwen2-72B-Instruct,0.7820338983050847
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.44033898305084745
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.4633898305084746
|
11 |
Meta-Llama-3.1-70B-Instruct,0.6423728813559322
|
|
|
6 |
Qwen2_5_7B_Instruct,0.6732203389830509
|
7 |
Qwen2_5_1_5B_Instruct,0.5135593220338983
|
8 |
Qwen2-72B-Instruct,0.7820338983050847
|
9 |
+
Sailor2-8B-Chat,0.5569491525423729
|
10 |
Meta-Llama-3-8B-Instruct,0.44033898305084745
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.4633898305084746
|
12 |
Meta-Llama-3.1-70B-Instruct,0.6423728813559322
|
results/fundamental_nlp_tasks/zero_shot/qnli.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.9062786015010068
|
|
6 |
Qwen2_5_7B_Instruct,0.8652754896576972
|
7 |
Qwen2_5_1_5B_Instruct,0.6148636280431997
|
8 |
Qwen2-72B-Instruct,0.8887058392824455
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.6025993044114956
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.6522057477576423
|
11 |
Meta-Llama-3.1-70B-Instruct,0.9026176093721399
|
|
|
6 |
Qwen2_5_7B_Instruct,0.8652754896576972
|
7 |
Qwen2_5_1_5B_Instruct,0.6148636280431997
|
8 |
Qwen2-72B-Instruct,0.8887058392824455
|
9 |
+
Sailor2-8B-Chat,0.6822258832143511
|
10 |
Meta-Llama-3-8B-Instruct,0.6025993044114956
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.6522057477576423
|
12 |
Meta-Llama-3.1-70B-Instruct,0.9026176093721399
|
results/fundamental_nlp_tasks/zero_shot/qqp.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8315
|
|
6 |
Qwen2_5_7B_Instruct,0.76
|
7 |
Qwen2_5_1_5B_Instruct,0.731
|
8 |
Qwen2-72B-Instruct,0.8065
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.563
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.597
|
11 |
Meta-Llama-3.1-70B-Instruct,0.815
|
|
|
6 |
Qwen2_5_7B_Instruct,0.76
|
7 |
Qwen2_5_1_5B_Instruct,0.731
|
8 |
Qwen2-72B-Instruct,0.8065
|
9 |
+
Sailor2-8B-Chat,0.8205
|
10 |
Meta-Llama-3-8B-Instruct,0.563
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.597
|
12 |
Meta-Llama-3.1-70B-Instruct,0.815
|
results/fundamental_nlp_tasks/zero_shot/rte.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.9097472924187726
|
|
6 |
Qwen2_5_7B_Instruct,0.8592057761732852
|
7 |
Qwen2_5_1_5B_Instruct,0.703971119133574
|
8 |
Qwen2-72B-Instruct,0.8447653429602888
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.6173285198555957
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.6606498194945848
|
11 |
Meta-Llama-3.1-70B-Instruct,0.8483754512635379
|
|
|
6 |
Qwen2_5_7B_Instruct,0.8592057761732852
|
7 |
Qwen2_5_1_5B_Instruct,0.703971119133574
|
8 |
Qwen2-72B-Instruct,0.8447653429602888
|
9 |
+
Sailor2-8B-Chat,0.8122743682310469
|
10 |
Meta-Llama-3-8B-Instruct,0.6173285198555957
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.6606498194945848
|
12 |
Meta-Llama-3.1-70B-Instruct,0.8483754512635379
|
results/fundamental_nlp_tasks/zero_shot/wnli.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8732394366197183
|
|
6 |
Qwen2_5_7B_Instruct,0.7605633802816901
|
7 |
Qwen2_5_1_5B_Instruct,0.4647887323943662
|
8 |
Qwen2-72B-Instruct,0.8873239436619719
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.4788732394366197
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5492957746478874
|
11 |
Meta-Llama-3.1-70B-Instruct,0.8450704225352113
|
|
|
6 |
Qwen2_5_7B_Instruct,0.7605633802816901
|
7 |
Qwen2_5_1_5B_Instruct,0.4647887323943662
|
8 |
Qwen2-72B-Instruct,0.8873239436619719
|
9 |
+
Sailor2-8B-Chat,0.5492957746478874
|
10 |
Meta-Llama-3-8B-Instruct,0.4788732394366197
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5492957746478874
|
12 |
Meta-Llama-3.1-70B-Instruct,0.8450704225352113
|
results/general_reasoning/zero_shot/c_eval.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8262764632627646
|
|
6 |
Qwen2_5_7B_Instruct,0.7459526774595268
|
7 |
Qwen2_5_1_5B_Instruct,0.5971357409713575
|
8 |
Qwen2-72B-Instruct,0.8312577833125778
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.4775840597758406
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5205479452054794
|
11 |
Meta-Llama-3.1-70B-Instruct,0.6612702366127023
|
|
|
6 |
Qwen2_5_7B_Instruct,0.7459526774595268
|
7 |
Qwen2_5_1_5B_Instruct,0.5971357409713575
|
8 |
Qwen2-72B-Instruct,0.8312577833125778
|
9 |
+
Sailor2-8B-Chat,0.5946450809464509
|
10 |
Meta-Llama-3-8B-Instruct,0.4775840597758406
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5205479452054794
|
12 |
Meta-Llama-3.1-70B-Instruct,0.6612702366127023
|
results/general_reasoning/zero_shot/cmmlu.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.8273182524607149
|
|
6 |
Qwen2_5_7B_Instruct,0.7486617164565705
|
7 |
Qwen2_5_1_5B_Instruct,0.5975651873596961
|
8 |
Qwen2-72B-Instruct,0.8293904334311863
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.4839405974788465
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.5171818338801588
|
11 |
Meta-Llama-3.1-70B-Instruct,0.6814885166637886
|
|
|
6 |
Qwen2_5_7B_Instruct,0.7486617164565705
|
7 |
Qwen2_5_1_5B_Instruct,0.5975651873596961
|
8 |
Qwen2-72B-Instruct,0.8293904334311863
|
9 |
+
Sailor2-8B-Chat,0.6416853738559835
|
10 |
Meta-Llama-3-8B-Instruct,0.4839405974788465
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.5171818338801588
|
12 |
Meta-Llama-3.1-70B-Instruct,0.6814885166637886
|
results/general_reasoning/zero_shot/mmlu.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.7996424740793707
|
|
6 |
Qwen2_5_7B_Instruct,0.6935287808366106
|
7 |
Qwen2_5_1_5B_Instruct,0.5646764390418305
|
8 |
Qwen2-72B-Instruct,0.7922774401144083
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.6005720414730068
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.631748301751877
|
11 |
Meta-Llama-3.1-70B-Instruct,0.8058634250983197
|
|
|
6 |
Qwen2_5_7B_Instruct,0.6935287808366106
|
7 |
Qwen2_5_1_5B_Instruct,0.5646764390418305
|
8 |
Qwen2-72B-Instruct,0.7922774401144083
|
9 |
+
Sailor2-8B-Chat,0.6202359671076153
|
10 |
Meta-Llama-3-8B-Instruct,0.6005720414730068
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.631748301751877
|
12 |
Meta-Llama-3.1-70B-Instruct,0.8058634250983197
|
results/general_reasoning/zero_shot/zbench.csv
CHANGED
@@ -6,6 +6,7 @@ Qwen2_5_32B_Instruct,0.6060606060606061
|
|
6 |
Qwen2_5_7B_Instruct,0.6666666666666666
|
7 |
Qwen2_5_1_5B_Instruct,0.42424242424242425
|
8 |
Qwen2-72B-Instruct,0.5757575757575758
|
|
|
9 |
Meta-Llama-3-8B-Instruct,0.3333333333333333
|
10 |
merged_llama3_8b_sg_inst_avg_diff,0.42424242424242425
|
11 |
Meta-Llama-3.1-70B-Instruct,0.48484848484848486
|
|
|
6 |
Qwen2_5_7B_Instruct,0.6666666666666666
|
7 |
Qwen2_5_1_5B_Instruct,0.42424242424242425
|
8 |
Qwen2-72B-Instruct,0.5757575757575758
|
9 |
+
Sailor2-8B-Chat,0.5151515151515151
|
10 |
Meta-Llama-3-8B-Instruct,0.3333333333333333
|
11 |
merged_llama3_8b_sg_inst_avg_diff,0.42424242424242425
|
12 |
Meta-Llama-3.1-70B-Instruct,0.48484848484848486
|