chinese_benchmark_gen.csv ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Size,Accuracy/std,Precision_Unsafe/std,Recall_Unsafe/std,Precision_Safe/std,Recall_Safe/std
2
+ GPT-4o,API,73.78/0.30,97.75/0.13,48.66/0.04,65.84/0.55,98.88/0.04
3
+ GPT-4-Turbo,API,71.67/0.17,80.13/0.64,57.59/0.69,66.93/0.44,85.74/0.35
4
+ Perspective,API,69.28/0.32,69.96/0.79,67.49/0.32,68.64/0.32,71.06/0.43
5
+ GPT-3.5,API,64.70/0.44,76.12/0.55,42.79/0.64,60.24/0.76,86.59/0.32
6
+ Gemini-2.5-flash-preview-05-20,API,71.27/0.27,73.40/0.23,70.16/0.71,69.17/0.53,72.48/0.40
7
+ Llama-4-maverick,API,75.02/0.03,62.35/0.10,83.53/0.03,87.71/0.04,69.96/0.04
8
+ Gemini-2.0-flash-001,API,52.04/0.61,0.95/0.05,69.46/0.38,99.60/0.03,51.93/0.62
9
+ Deepseek-chat-v3-0324,API,66.00/0.11,45.08/0.11,77.52/0.19,86.93/0.11,61.28/0.08
10
+ Phi-3-small-8k-instruct,5B~10B,72.73/0.47,73.67/0.63,71.12/0.49,71.85/0.35,74.36/0.59
11
+ Gemma-1.1-7B-it,5B~10B,71.70/0.26,68.66/0.37,80.11/0.05,76.00/0.09,63.26/0.47
12
+ DeepSeek-LLM-7B-Chat,5B~10B,71.63/0.17,69.50/0.15,77.33/0.67,74.33/0.41,65.90/0.38
13
+ GLM-4-9B-Chat,5B~10B,70.96/0.23,82.15/0.55,53.73/0.48,65.50/0.18,88.27/0.41
14
+ Mistral-7B-Instruct-v0.3,5B~10B,70.41/0.41,68.55/0.52,75.67/0.22,72.71/0.26,65.12/0.58
15
+ Qwen1.5-7B-Chat,5B~10B,70.36/0.39,64.66/0.27,90.09/0.57,83.55/0.82,50.53/0.18
16
+ Phi-3-small-128k-instruct,5B~10B,67.43/0.26,72.10/0.54,57.35/0.17,64.33/0.09,77.61/0.43
17
+ Ministral-8B-Instruct-2410,5B~10B,62.32/0.01,62.71/0.19,61.60/0.29,61.94/0.19,63.05/0.28
18
+ Yi-1.5-9B-Chat,5B~10B,62.12/0.38,64.42/0.42,54.53/0.43,60.43/0.36,69.75/0.37
19
+ Llama3-ChatQA-1.5-8B,5B~10B,61.28/0.40,57.63/0.20,85.84/0.43,72.02/0.95,36.61/0.54
20
+ Baichuan2-7B-Chat,5B~10B,59.43/0.24,72.06/0.66,31.11/0.40,55.95/0.12,87.89/0.20
21
+ InternLM2-chat-7B,5B~10B,58.79/0.09,62.70/0.19,43.88/0.17,56.68/0.14,73.77/0.13
22
+ GPT-J-6B,5B~10B,52.65/0.32,52.42/0.32,62.00/0.42,52.99/0.37,43.21/0.92
23
+ Opt-6.7B,5B~10B,50.00/0.11,50.17/0.17,64.70/0.35,49.69/0.04,35.18/0.44
24
+ Qwen3-4B,5B~10B,74.95/0.01,76.47/0.01,72.10/0.00,73.61/0.01,77.81/0.01
25
+ Gemma-3-4B-it,5B~10B,71.41/0.00,66.54/0.00,86.12/0.00,80.33/0.00,56.70/0.00
26
+ phi-4,10B~20B,72.24/0.24,76.59/0.46,64.42/0.51,69.06/0.15,80.13/0.62
27
+ InternLM2-Chat-20B,10B~20B,70.21/0.55,73.30/0.70,63.79/0.43,67.82/0.45,76.65/0.67
28
+ Qwen1.5-14B-Chat,10B~20B,68.25/0.44,65.87/0.37,76.02/0.72,71.51/0.59,60.44/0.20
29
+ Phi-3-medium-128k-instruct,10B~20B,64.30/0.06,63.89/0.13,66.53/0.52,64.76/0.26,62.05/0.42
30
+ Baichuan2-13B-Chat,10B~20B,62.86/0.31,64.17/0.33,58.61/0.80,61.75/0.30,67.13/0.56
31
+ Mistral-Nemo-Instruct-2407,10B~20B,59.71/0.45,61.79/0.52,51.82/0.48,58.20/0.44,67.68/0.44
32
+ Phi-3-medium-4k-instruct,10B~20B,57.79/0.45,58.69/0.37,53.88/0.62,57.02/0.55,61.74/0.55
33
+ Ziya2-13B-Chat,10B~20B,53.40/0.43,53.33/0.38,56.18/0.41,53.48/0.53,50.62/0.61
34
+ Opt-13B,10B~20B,50.18/0.26,50.29/0.20,69.97/0.37,49.94/0.47,30.22/0.31
35
+ Moonlight-16B-A3B-Instruct,10B~20B,45.16/0.43,44.16/0.64,34.79/0.67,45.82/0.33,55.62/0.35
36
+ Qwen3-14B,10B~20B,68.54/0.01,67.24/0.01,72.29/0.00,70.04/0.00,64.78/0.01
37
+ Gemma-3-12B-it,10B~20B,65.63/0.00,62.69/0.00,77.18/0.00,70.32/0.00,54.07/0.00
38
+ DeepSeek-LLM-67B-Chat,>65B,76.76/0.35,73.40/0.37,84.26/0.40,81.34/0.35,69.19/0.64
39
+ Llama3-ChatQA-1.5-70B,>65B,65.29/0.29,66.24/0.50,62.92/0.12,64.43/0.19,67.69/0.63
40
+ Qwen2.5-72B-Instruct,>65B,63.41/0.77,66.00/0.95,56.00/0.62,61.49/0.65,70.90/0.96
41
+ Qwen1.5-72B-Chat,>65B,62.91/0.50,73.86/0.84,40.46/0.97,58.75/0.35,85.55/0.62
42
+ Opt-66B,>65B,54.46/0.17,53.22/0.06,76.94/0.24,57.73/0.49,31.77/0.28
43
+ Qwen2-72B-Instruct,>65B,54.08/0.20,58.10/0.60,30.72/0.45,52.63/0.05,77.65/0.36
44
+ DeepSeek-R1-Distill-Llama-70B,>65B,52.93/0.18,59.69/0.47,19.33/0.38,51.62/0.16,86.83/0.18
45
+ Llama-3.1-70B-Instruct,>65B,52.84/0.38,59.07/1.22,19.82/0.85,51.57/0.24,86.14/0.58
46
+ Llama-3.3-70B-Instruct,>65B,50.87/0.07,54.51/0.86,13.19/0.10,50.37/0.06,88.89/0.39
47
+ Qwen3-32B,~30B,75.26/0.00,89.11/0.00,57.55/0.00,68.65/0.00,92.97/0.00
48
+ Qwen2.5-32B-Instruct,~30B,69.64/0.39,92.13/0.45,43.24/0.83,62.70/0.25,96.27/0.20
49
+ QwQ-32B-Preview,~30B,69.55/0.28,75.97/0.48,57.60/0.27,65.61/0.17,81.62/0.33
50
+ Mistral-Small-24B-Instruct-2501,~30B,64.48/0.17,64.61/0.35,64.71/0.72,64.34/0.00,64.23/1.04
51
+ Yi-1.5-34B-Chat,~30B,60.06/0.43,58.14/0.40,72.51/0.55,63.27/0.56,47.56/0.42
52
+ Opt-30B,~30B,50.88/0.11,50.76/0.12,72.95/0.16,51.18/0.26,28.62/0.28
53
+ Gemma-3-27B-it,~30B,68.50/0.00,68.37/0.00,68.84/0.00,68.62/0.00,68.15/0.00
54
+ OpenThinker2-32B,~30B,65.01/0.01,74.90/0.01,45.13/0.01,60.74/0.01,84.87/0.00
chinese_benchmark_per.csv ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Size,Accuracy/std,Precision_Unsafe/std,Recall_Unsafe/std,Precision_Safe/std,Recall_Safe/std
2
+ Yi-1.5-34B-Chat,~30B,66.02/0.22,80.13/0.55,42.82/0.25,60.86/0.16,89.33/0.41
3
+ Qwen2.5-32B-Instruct,~30B,64.33/0.46,62.46/0.44,72.24/0.71,66.91/0.53,56.38/0.18
4
+ Opt-30B,~30B,53.82/0.03,54.42/0.21,48.32/0.20,53.34/0.11,59.34/0.27
5
+ QwQ-32B-Preview,~30B,51.82/0.06,51.04/0.10,94.83/0.28,62.38/0.26,8.61/0.39
6
+ Gemma-3-27B-it,~30B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
7
+ Qwen3-32B,~30B,49.66/0.00,49.83/0.00,99.03/0.00,22.40/0.00,0.28/0.00
8
+ OpenThinker2-32B,~30B,49.91/0.00,49.95/0.00,98.26/0.00,47.27/0.00,1.56/0.00
9
+ DeepSeek-LLM-67B-Chat,>65B,68.08/0.35,94.80/0.83,38.40/0.43,61.27/0.26,97.88/0.36
10
+ Qwen1.5-72B-Chat,>65B,63.67/0.46,58.27/0.32,96.84/0.13,90.51/0.57,30.34/0.80
11
+ Qwen2.5-72B-Instruct,>65B,63.27/0.52,66.00/0.60,55.09/0.82,61.31/0.46,71.49/0.25
12
+ Qwen2-72B-Instruct,>65B,60.70/0.49,57.90/0.42,79.03/0.63,66.75/0.77,42.28/0.43
13
+ Opt-66B,>65B,59.93/0.41,56.52/0.37,86.87/0.59,71.36/0.78,32.86/0.74
14
+ DeepSeek-R1-Distill-Llama-70B,>65B,47.68/0.64,45.77/1.21,23.85/0.67,48.35/0.46,71.62/0.60
15
+ Llama-3.1-70B-Instruct,>65B,43.68/0.41,36.45/0.84,16.66/0.34,45.83/0.30,70.82/0.48
16
+ Llama3-ChatQA-1.5-70B,>65B,40.41/0.29,33.86/0.75,19.84/0.75,43.13/0.25,61.08/0.37
17
+ Llama-3.3-70B-Instruct,>65B,36.84/0.82,32.02/1.29,23.19/1.13,39.58/0.63,50.55/0.69
18
+ Phi-3-medium-4k-instruct,10B~20B,71.04/0.31,69.74/0.29,74.56/0.97,72.54/0.59,67.49/0.89
19
+ Baichuan2-13B-Chat,10B~20B,70.43/0.39,65.81/0.38,85.34/0.63,79.02/0.63,55.46/0.47
20
+ Phi-3-medium-128k-instruct,10B~20B,68.87/0.81,68.08/0.51,71.32/1.44,69.75/1.17,66.41/0.57
21
+ Mistral-Nemo-Instruct-2407,10B~20B,66.88/0.46,62.56/0.28,84.42/0.90,75.89/1.13,49.26/0.24
22
+ phi-4,10B~20B,62.62/0.32,63.73/0.41,58.98/0.20,61.66/0.31,66.28/0.78
23
+ Qwen1.5-14B-Chat,10B~20B,61.29/0.40,57.02/0.32,92.43/0.55,79.80/1.05,30.02/0.47
24
+ Mistral-Small-24B-Instruct-2501,~30B,59.20/0.46,58.32/0.42,65.16/1.08,60.33/0.56,53.22/0.20
25
+ Ziya2-13B-Chat,10B~20B,55.25/0.26,59.24/0.37,34.30/0.11,53.61/0.26,76.29/0.39
26
+ InternLM2-Chat-20B,10B~20B,53.67/0.16,79.00/0.66,10.30/0.60,51.90/0.11,97.25/0.26
27
+ Opt-13B,10B~20B,49.31/0.31,37.77/3.57,1.76/0.16,49.59/0.23,97.08/0.29
28
+ Moonlight-16B-A3B-Instruct,10B~20B,48.92/0.16,3.46/0.57,0.07/0.01,49.40/0.15,98.00/0.08
29
+ Qwen3-14B,10B~20B,48.34/0.00,49.14/0.00,95.13/0.00,24.26/0.00,1.56/0.00
30
+ Gemma-3-12B-it,10B~20B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
31
+ Gemma-1.1-7B-it,5B~10B,64.32/0.68,59.98/0.58,86.60/0.35,75.70/0.80,41.95/0.93
32
+ Qwen1.5-7B-Chat,5B~10B,62.48/0.54,59.06/0.48,81.92/0.50,70.28/0.65,42.96/0.81
33
+ Phi-3-small-128k-instruct,5B~10B,61.76/0.27,60.47/0.16,68.45/0.61,63.46/0.50,55.05/0.61
34
+ Yi-1.5-9B-Chat,5B~10B,60.35/0.52,79.47/1.37,28.16/0.33,56.22/0.39,92.69/0.59
35
+ Phi-3-small-8k-instruct,5B~10B,59.47/0.39,56.25/0.30,86.06/0.40,70.05/0.85,32.75/0.49
36
+ DeepSeek-LLM-7B-Chat,5B~10B,56.79/0.19,84.83/1.23,16.77/0.09,53.70/0.15,96.99/0.27
37
+ Ministral-8B-Instruct-2410,5B~10B,56.28/0.51,55.10/0.51,68.83/0.58,58.24/0.51,43.66/0.54
38
+ GPT-J-6B,5B~10B,55.98/0.42,80.27/1.42,16.11/0.86,53.26/0.23,96.03/0.20
39
+ Baichuan2-7B-Chat,5B~10B,53.99/0.51,62.89/1.57,19.96/0.88,52.31/0.30,88.18/0.23
40
+ GLM-4-9B-Chat,5B~10B,50.03/0.15,50.07/0.13,99.31/0.22,44.12/9.01,0.52/0.04
41
+ InternLM2-Chat-7B,5B~10B,49.49/0.11,42.16/1.58,2.15/0.31,49.68/0.13,97.06/0.25
42
+ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
43
+ Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
44
+ Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
45
+ Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
46
+ Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00