feat: update new models #15
opened by Jerry0723

Files changed:
- .idea/workspace.xml +0 -58
- app.py +32 -34
- changelog.md +1 -15
- data/chinese_benchmark_gen.csv +43 -54
- data/chinese_benchmark_per.csv +39 -46
- data/subclass_gen.csv +26 -38
- data/subclass_per.csv +27 -32
.idea/workspace.xml
DELETED
@@ -1,58 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ChangeListManager">
-    <list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
-    <option name="SHOW_DIALOG" value="false" />
-    <option name="HIGHLIGHT_CONFLICTS" value="true" />
-    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
-    <option name="LAST_RESOLUTION" value="IGNORE" />
-  </component>
-  <component name="Git.Settings">
-    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
-  </component>
-  <component name="MarkdownSettingsMigration">
-    <option name="stateVersion" value="1" />
-  </component>
-  <component name="ProjectColorInfo">{
-  "customColor": "",
-  "associatedIndex": 2
-}</component>
-  <component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
-  <component name="ProjectViewState">
-    <option name="hideEmptyMiddlePackages" value="true" />
-    <option name="showLibraryContents" value="true" />
-  </component>
-  <component name="PropertiesComponent"><![CDATA[{
-  "keyToString": {
-    "RunOnceActivity.OpenProjectViewOnStart": "true",
-    "RunOnceActivity.ShowReadmeOnStart": "true",
-    "git-widget-placeholder": "pr/18",
-    "last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
-    "nodejs_package_manager_path": "npm",
-    "vue.rearranger.settings.migration": "true"
-  }
-}]]></component>
-  <component name="SharedIndexes">
-    <attachedChunks>
-      <set>
-        <option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
-      </set>
-    </attachedChunks>
-  </component>
-  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
-  <component name="TaskManager">
-    <task active="true" id="Default" summary="Default task">
-      <changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
-      <created>1751365967779</created>
-      <option name="number" value="Default" />
-      <option name="presentableId" value="Default" />
-      <updated>1751365967779</updated>
-      <workItem from="1751365968934" duration="39000" />
-      <workItem from="1751366116696" duration="54000" />
-    </task>
-    <servers />
-  </component>
-  <component name="TypeScriptGeneratedFilesManager">
-    <option name="version" value="3" />
-  </component>
-</project>
app.py
CHANGED
@@ -6,13 +6,11 @@ import pandas as pd
 from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
 
 
-ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv",
-ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv",
-
-
-ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
-ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
+ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
+ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
+
+ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
+ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
 
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
@@ -30,7 +28,7 @@ CLASSIFICATION = {
         "5B~10B",
         "API",
     ]
-
+
 }
 
 
@@ -38,13 +36,13 @@ CLASSIFICATION = {
 
 _BIBTEX = """
 @misc{zhang2024chinesesafechinesebenchmarkevaluating,
-      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
+      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
       author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
       year={2024},
       eprint={2410.18491},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2410.18491},
+      url={https://arxiv.org/abs/2410.18491},
 }
 """
 
@@ -64,8 +62,8 @@ def format_csv_numbers(text):
 
 def format_csv_numbers_second(text):
     return text.split()
-
-
+
+
 def format_number(x):
     return float(f"{x:.3}")
 
@@ -75,7 +73,7 @@ def get_dataset_csv(
 ):
     df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
     df = df.drop(columns="Size")
-
+
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -103,11 +101,11 @@ def get_dataset_csv_sub_gen(
 ):
     df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
     df = df.drop(columns="Size")
-
+
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
-
+
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -122,11 +120,11 @@ def get_dataset_csv_sub_per(
 ):
     df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
     df = df.drop(columns="Size")
-
+
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
-
+
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -145,7 +143,7 @@ def get_dataset_classfier_gen(
     subclass_choice = main_choice
     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
-
+
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -166,10 +164,10 @@ with gr.Blocks() as demo:
 
     with gr.Row():
        gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
-
+
    with gr.Row():
        gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
-
+
    with gr.Row():
        with gr.Column(scale=0.8):
            main_choice = gr.Dropdown(
@@ -178,8 +176,8 @@ with gr.Blocks() as demo:
                label="Type",
                info="Please choose the type to display.",
            )
-
-        with gr.Column(scale=10):
+
+        with gr.Column(scale=10):
            model_choice = gr.CheckboxGroup(
                choices=CLASSIFICATION["model_size"],
                value=CLASSIFICATION["model_size"], # all be choosed
@@ -190,12 +188,12 @@ with gr.Blocks() as demo:
    #👉 this part is for csv table generatived
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # ----------------- modify text -----------------
-
+
        with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
            dataframe_all_gen = gr.components.Dataframe(
                elem_id="leaderboard-table",
            )
-
+
        with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
            dataframe_all_per = gr.components.Dataframe(
                elem_id="leaderboard-table",
@@ -204,10 +202,10 @@ with gr.Blocks() as demo:
    # ----------------- modify text -----------------
    with gr.Row():
        gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
-
+
    with gr.Row():
        gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
-
+
    # 👉 this part is for citation
    with gr.Row():
        with gr.Accordion("📙 Citation", open=True):
@@ -218,18 +216,18 @@ with gr.Blocks() as demo:
                elem_id="citation-button",
                show_copy_button=True
            )
-
+
    gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
-
+
    # --------------------------- all --------------------------------
    # this is all result Perplexity
-
+
    main_choice.change(
        get_dataset_classfier_per,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
-
+
    model_choice.change(
        get_dataset_classfier_per,
        inputs=[model_choice, main_choice],
@@ -241,26 +239,26 @@ with gr.Blocks() as demo:
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_per,
    )
-
+
    # this is all result generatived
    main_choice.change(
        get_dataset_classfier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
-
+
    model_choice.change(
        get_dataset_classfier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
-
+
    demo.load(
        fn=get_dataset_classfier_gen,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )
-
-
+
+
 demo.launch(share=True)
 
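The only behavioral change in app.py is the explicit `read_csv` separator switch shown above; the remaining hunks are whitespace cleanup. A minimal sketch of what the reworked loading and Size filtering amount to (assuming the file layout and the `Size` column shown in this diff; the list of size buckets below is only an illustrative selection, not code from the PR):

```python
import pandas as pd

# Sketch of the loading introduced in this PR: the two main benchmark tables are
# read with an explicit tab separator, the subclass tables with a comma separator.
df_gen = pd.read_csv("./data/chinese_benchmark_gen.csv", sep="\t")
df_sub_gen = pd.read_csv("./data/subclass_gen.csv", sep=",")

# Mirrors get_dataset_csv(): keep the requested size buckets, then hide the Size column.
selected_sizes = [">65B", "~30B", "10B~20B", "5B~10B", "API"]  # illustrative selection
df_view = df_gen[df_gen["Size"].isin(selected_sizes)].drop(columns="Size")
print(df_view.head())
```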
changelog.md
CHANGED
@@ -52,18 +52,4 @@ version: v1.0.5
 - DeepSeek-R1-Distill-Llama-70B
 - Mistral-Small-24B-Instruct-2501
 - Moonlight-16B-A3B-Instruct
-- [2]feat: release a test set of 20000 samples
-
-
-
-### 2025-7-1
-version: v1.0.6
-
-changed:
-- [1]feat: update many models due to the April's todo-list:
-- Llama-4-maverick
-- Gemini-2.5-flash-preview-05-20
-- Deepseek-chat-v3-0324
-- Qwen3
-- Gemma-3
-- OpenThinker2
+- [2]feat: release a test set of 20000 samples
data/chinese_benchmark_gen.csv
CHANGED
@@ -1,54 +1,43 @@
[rows 1-43 of the removed comma-separated table are not legible in this capture; the surviving removed rows follow]
-DeepSeek-R1-Distill-Llama-70B,>65B,52.93/0.18,59.69/0.47,19.33/0.38,51.62/0.16,86.83/0.18
-Llama-3.1-70B-Instruct,>65B,52.84/0.38,59.07/1.22,19.82/0.85,51.57/0.24,86.14/0.58
-Llama-3.3-70B-Instruct,>65B,50.87/0.07,54.51/0.86,13.19/0.10,50.37/0.06,88.89/0.39
-Qwen3-32B,>65B,75.26/0.00,89.11/0.00,57.55/0.0,68.65/0.00,92.97/0.00
-Qwen2.5-32B-Instruct,~30B,69.64/0.39,92.13/0.45,43.24/0.83,62.70/0.25,96.27/0.20
-QwQ-32B-Preview,~30B,69.55/0.28,75.97/0.48,57.60/0.27,65.61/0.17,81.62/0.33
-Mistral-Small-24B-Instruct-2501,~30B,64.48/0.17,64.61/0.35,64.71/0.72,64.34/0.00,64.23/1.04
-Yi-1.5-34B-Chat,~30B,60.06/0.43,58.14/0.40,72.51/0.55,63.27/0.56,47.56/0.42
-Opt-30B,~30B,50.88/0.11,50.76/0.12,72.95/0.16,51.18/0.26,28.62/0.28
-Gemma-3-27B-it,~30B,68.50/0.00,68.37/0.00,68.84/0.00,68.62/0.00,68.15/0.00
-OpenThinker2-32B,~30B,65.01/0.01,74.90/0.01,45.13/0.01,60.74/0.01,84.87/0.00
+Model Size Accuracy/std Precision_Unsafe/std Recall_Unsafe/std Precision_Safe/std Recall_Safe/std
+DeepSeek-LLM-67B-Chat >65B 76.76/0.35 73.40/0.37 84.26/0.40 81.34/0.35 69.19/0.64
+Llama3-ChatQA-1.5-70B >65B 65.29/0.29 66.24/0.50 62.92/0.12 64.43/0.19 67.69/0.63
+Qwen2.5-72B-Instruct >65B 63.41/0.77 66.00/0.95 56.00/0.62 61.49/0.65 70.90/0.96
+Qwen1.5-72B-Chat >65B 62.91/0.50 73.86/0.84 40.46/0.97 58.75/0.35 85.55/0.62
+Opt-66B >65B 54.46/0.17 53.22/0.06 76.94/0.24 57.73/0.49 31.77/0.28
+Qwen2-72B-Instruct >65B 54.08/0.20 58.10/0.60 30.72/0.45 52.63/0.05 77.65/0.36
+DeepSeek-R1-Distill-Llama-70B >65B 52.93/0.18 59.69/0.47 19.33/0.38 51.62/0.16 86.83/0.18
+Llama-3.1-70B-Instruct >65B 52.84/0.38 59.07/1.22 19.82/0.85 51.57/0.24 86.14/0.58
+Llama-3.3-70B-Instruct >65B 50.87/0.07 54.51/0.86 13.19/0.10 50.37/0.06 88.89/0.39
+Qwen2.5-32B-Instruct ~30B 69.64/0.39 92.13/0.45 43.24/0.83 62.70/0.25 96.27/0.20
+QwQ-32B-Preview ~30B 69.55/0.28 75.97/0.48 57.60/0.27 65.61/0.17 81.62/0.33
+Mistral-Small-24B-Instruct-2501 ~30B 64.48/0.17 64.61/0.35 64.71/0.72 64.34/0.00 64.23/1.04
+Yi-1.5-34B-Chat ~30B 60.06/0.43 58.14/0.40 72.51/0.55 63.27/0.56 47.56/0.42
+Opt-30B ~30B 50.88/0.11 50.76/0.12 72.95/0.16 51.18/0.26 28.62/0.28
+phi-4 10B~20B 72.24/0.24 76.59/0.46 64.42/0.51 69.06/0.15 80.13/0.62
+InternLM2-Chat-20B 10B~20B 70.21/0.55 73.30/0.70 63.79/0.43 67.82/0.45 76.65/0.67
+Qwen1.5-14B-Chat 10B~20B 68.25/0.44 65.87/0.37 76.02/0.72 71.51/0.59 60.44/0.20
+Phi-3-medium-128k-instruct 10B~20B 64.30/0.06 63.89/0.13 66.53/0.52 64.76/0.26 62.05/0.42
+Baichuan2-13B-Chat 10B~20B 62.86/0.31 64.17/0.33 58.61/0.80 61.75/0.30 67.13/0.56
+Mistral-Nemo-Instruct-2407 10B~20B 59.71/0.45 61.79/0.52 51.82/0.48 58.20/0.44 67.68/0.44
+Phi-3-medium-4k-instruct 10B~20B 57.79/0.45 58.69/0.37 53.88/0.62 57.02/0.55 61.74/0.55
+Ziya2-13B-Chat 10B~20B 53.40/0.43 53.33/0.38 56.18/0.41 53.48/0.53 50.62/0.61
+Opt-13B 10B~20B 50.18/0.26 50.29/0.20 69.97/0.37 49.94/0.47 30.22/0.31
+Moonlight-16B-A3B-Instruct 10B~20B 45.16/0.43 44.16/0.64 34.79/0.67 45.82/0.33 55.62/0.35
+Phi-3-small-8k-instruct 5B~10B 72.73/0.47 73.67/0.63 71.12/0.49 71.85/0.35 74.36/0.59
+Gemma-1.1-7B-it 5B~10B 71.70/0.26 68.66/0.37 80.11/0.05 76.00/0.09 63.26/0.47
+DeepSeek-LLM-7B-Chat 5B~10B 71.63/0.17 69.50/0.15 77.33/0.67 74.33/0.41 65.90/0.38
+GLM-4-9B-Chat 5B~10B 70.96/0.23 82.15/0.55 53.73/0.48 65.50/0.18 88.27/0.41
+Mistral-7B-Instruct-v0.3 5B~10B 70.41/0.41 68.55/0.52 75.67/0.22 72.71/0.26 65.12/0.58
+Qwen1.5-7B-Chat 5B~10B 70.36/0.39 64.66/0.27 90.09/0.57 83.55/0.82 50.53/0.18
+Phi-3-small-128k-instruct 5B~10B 67.43/0.26 72.10/0.54 57.35/0.17 64.33/0.09 77.61/0.43
+Ministral-8B-Instruct-2410 5B~10B 62.32/0.01 62.71/0.19 61.60/0.29 61.94/0.19 63.05/0.28
+Yi-1.5-9B-Chat 5B~10B 62.12/0.38 64.42/0.42 54.53/0.43 60.43/0.36 69.75/0.37
+Llama3-ChatQA-1.5-8B 5B~10B 61.28/0.40 57.63/0.20 85.84/0.43 72.02/0.95 36.61/0.54
+Baichuan2-7B-Chat 5B~10B 59.43/0.24 72.06/0.66 31.11/0.40 55.95/0.12 87.89/0.20
+InternLM2-chat-7B 5B~10B 58.79/0.09 62.70/0.19 43.88/0.17 56.68/0.14 73.77/0.13
+GPT-J-6B 5B~10B 52.65/0.32 52.42/0.32 62.00/0.42 52.99/0.37 43.21/0.92
+Opt-6.7B 5B~10B 50.00/0.11 50.17/0.17 64.70/0.35 49.69/0.04 35.18/0.44
+GPT-4o API 73.78/0.30 97.75/0.13 48.66/0.04 65.84/0.55 98.88/0.04
+GPT-4-Turbo API 71.67/0.17 80.13/0.64 57.59/0.69 66.93/0.44 85.74/0.35
+Pespective API 69.28/0.32 69.96/0.79 67.49/0.32 68.64/0.32 71.06/0.43
+GPT-3.5 API 64.70/0.44 76.12/0.55 42.79/0.64 60.24/0.76 86.59/0.32
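Each metric cell in these benchmark tables packs a mean and a standard deviation as `mean/std` (for example `76.76/0.35` for DeepSeek-LLM-67B-Chat's accuracy). A small helper along these lines (a sketch only, assuming the header shown above; `split_mean_std` is not part of the repository) turns a column into numeric values:

```python
import pandas as pd

def split_mean_std(cell: str) -> tuple[float, float]:
    """Split a 'mean/std' cell such as '76.76/0.35' into two floats."""
    mean, std = cell.split("/")
    return float(mean), float(std)

# Assumes the tab-separated layout this PR reads with sep='\t'.
df = pd.read_csv("./data/chinese_benchmark_gen.csv", sep="\t")
accuracy_mean = df["Accuracy/std"].map(lambda cell: split_mean_std(cell)[0])
print(accuracy_mean.sort_values(ascending=False).head())
```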
data/chinese_benchmark_per.csv
CHANGED
@@ -1,46 +1,39 @@
[rows 1-39 of the removed comma-separated table are not legible in this capture; the surviving removed rows follow]
-GLM-4-9B-Chat,5B~10B,50.03/0.15,50.07/0.13,99.31/0.22,44.12/9.01,0.52/0.04
-InternLM2-Chat-7B,5B~10B,49.49/0.11,42.16/1.58,2.15/0.31,49.68/0.13,97.06/0.25
-Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
-Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
-Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
-Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
-Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00
+Model Size Accuracy/std Precision_Unsafe/std Recall_Unsafe/std Precision_Safe/std Recall_Safe/std
+DeepSeek-LLM-67B-Chat >65B 68.08/0.35 94.80/0.83 38.40/0.43 61.27/0.26 97.88/0.36
+Qwen1.5-72B-Chat >65B 63.67/0.46 58.27/0.32 96.84/0.13 90.51/0.57 30.34/0.80
+Qwen2.5-72B-Instruct >65B 63.27/0.52 66.00/0.60 55.09/0.82 61.31/0.46 71.49/0.25
+Qwen2-72B-Instruct >65B 60.70/0.49 57.90/0.42 79.03/0.63 66.75/0.77 42.28/0.43
+Opt-66B >65B 59.93/0.41 56.52/0.37 86.87/0.59 71.36/0.78 32.86/0.74
+DeepSeek-R1-Distill-Llama-70B >65B 47.68/0.64 45.77/1.21 23.85/0.67 48.35/0.46 71.62/0.60
+Llama-3.1-70B-Instruct >65B 43.68/0.41 36.45/0.84 16.66/0.34 45.83/0.30 70.82/0.48
+Llama3-ChatQA-1.5-70B >65B 40.41/0.29 33.86/0.75 19.84/0.75 43.13/0.25 61.08/0.37
+Llama-3.3-70B-Instruct >65B 36.84/0.82 32.02/1.29 23.19/1.13 39.58/0.63 50.55/0.69
+Yi-1.5-34B-Chat ~30B 66.02/0.22 80.13/0.55 42.82/0.25 60.86/0.16 89.33/0.41
+Qwen2.5-32B-Instruct ~30B 64.33/0.46 62.46/0.44 72.24/0.71 66.91/0.53 56.38/0.18
+Opt-30B ~30B 53.82/0.03 54.42/0.21 48.32/0.20 53.34/0.11 59.34/0.27
+QwQ-32B-Preview ~30B 51.82/0.06 51.04/0.10 94.83/0.28 62.38/0.26 8.61/0.39
+Phi-3-medium-4k-instruct 10B~20B 71.04/0.31 69.74/0.29 74.56/0.97 72.54/0.59 67.49/0.89
+Baichuan2-13B-Chat 10B~20B 70.43/0.39 65.81/0.38 85.34/0.63 79.02/0.63 55.46/0.47
+Phi-3-medium-128k-instruct 10B~20B 68.87/0.81 68.08/0.51 71.32/1.44 69.75/1.17 66.41/0.57
+Mistral-Nemo-Instruct-2407 10B~20B 66.88/0.46 62.56/0.28 84.42/0.90 75.89/1.13 49.26/0.24
+phi-4 10B~20B 62.62/0.32 63.73/0.41 58.98/0.20 61.66/0.31 66.28/0.78
+Qwen1.5-14B-Chat 10B~20B 61.29/0.40 57.02/0.32 92.43/0.55 79.80/1.05 30.02/0.47
+Mistral-Small-24B-Instruct-2501 10B~20B 59.20/0.46 58.32/0.42 65.16/1.08 60.33/0.56 53.22/0.20
+Ziya2-13B-Chat 10B~20B 55.25/0.26 59.24/0.37 34.30/0.11 53.61/0.26 76.29/0.39
+InternLM2-Chat-20B 10B~20B 53.67/0.16 79.00/0.66 10.30/0.60 51.90/0.11 97.25/0.26
+Opt-13B 10B~20B 49.31/0.31 37.77/3.57 1.76/0.16 49.59/0.23 97.08/0.29
+Moonlight-16B-A3B-Instruct 10B~20B 48.92/0.16 3.46/0.57 0.07/0.01 49.40/0.15 98.00/0.08
+Gemma-1.1-7B-it 5B~10B 64.32/0.68 59.98/0.58 86.60/0.35 75.70/0.80 41.95/0.93
+Qwen1.5-7B-Chat 5B~10B 62.48/0.54 59.06/0.48 81.92/0.50 70.28/0.65 42.96/0.81
+Phi-3-small-128k-instruct 5B~10B 61.76/0.27 60.47/0.16 68.45/0.61 63.46/0.50 55.05/0.61
+Yi-1.5-9B-Chat 5B~10B 60.35/0.52 79.47/1.37 28.16/0.33 56.22/0.39 92.69/0.59
+Phi-3-small-8k-instruct 5B~10B 59.47/0.39 56.25/0.30 86.06/0.40 70.05/0.85 32.75/0.49
+DeepSeek-LLM-7B-Chat 5B~10B 56.79/0.19 84.83/1.23 16.77/0.09 53.70/0.15 96.99/0.27
+Ministral-8B-Instruct-2410 5B~10B 56.28/0.51 55.10/0.51 68.83/0.58 58.24/0.51 43.66/0.54
+GPT-J-6B 5B~10B 55.98/0.42 80.27/1.42 16.11/0.86 53.26/0.23 96.03/0.20
+Baichuan2-7B-Chat 5B~10B 53.99/0.51 62.89/1.57 19.96/0.88 52.31/0.30 88.18/0.23
+GLM-4-9B-Chat 5B~10B 50.03/0.15 50.07/0.13 99.31/0.22 44.12/9.01 0.52/0.04
+InternLM2-Chat-7B 5B~10B 49.49/0.11 42.16/1.58 2.15/0.31 49.68/0.13 97.06/0.25
+Opt-6.7B 5B~10B 48.54/0.43 49.24/0.31 86.62/1.03 43.40/1.18 10.30/0.55
+Mistral-7B-Instruct-v0.3 5B~10B 42.99/0.06 39.54/0.47 26.01/0.69 44.69/0.11 60.05/0.50
+Llama3-ChatQA-1.5-8B 5B~10B 42.11/0.29 37.46/0.85 23.20/0.89 44.20/0.09 61.11/0.57
data/subclass_gen.csv
CHANGED
@@ -1,8 +1,28 @@
 Model,Size,Discrimination_Accuracy,Discrimination_Precision,Discrimination_Recall,Variant_Accuracy,Variant_Precision,Variant_Recall,Psychology_Accuracy,Psychology_Precision,Psychology_Recall,Politics_Accuracy,Politics_Precision,Politics_Recall,Eroticism_Accuracy,Eroticism_Precision,Eroticism_Recall,Vulgarity_Accuracy,Vulgarity_Precision,Vulgarity_Recall,Property_Accuracy,Property_Precision,Property_Recall,Injury_Accuracy,Injury_Precision,Injury_Recall,Criminality_Accuracy,Criminality_Precision,Criminality_Recall,Ethics_Accuracy,Ethics_Precision,Ethics_Recall
[the four removed rows at lines 2-5 are not legible in this capture]
+DeepSeek-LLM-67B-Chat,>65B,0.7897,0.7454,0.8652,0.8482,0.7832,0.9726,0.6603,0.6751,0.6011,0.8344,0.7978,0.932,0.8367,0.78,0.9497,0.8449,0.769,0.9767,0.7985,0.7493,0.8825,0.6171,0.6366,0.5125,0.8258,0.7583,0.9401,0.7387,0.7276,0.7596
+Qwen1.5-72B-Chat,>65B,0.5998,0.693,0.3298,0.8005,0.8477,0.7444,0.4697,0.3314,0.0703,0.6671,0.812,0.506,0.7676,0.8369,0.6803,0.7069,0.7895,0.5476,0.5825,0.6666,0.2918,0.4697,0.3186,0.0668,0.7076,0.7867,0.546,0.5283,0.5803,0.1942
+Qwen2.5-72B-Instruct,>65B,0.6248,0.6318,0.5580,0.8125,0.7581,0.9309,0.3779,0.1555,0.0588,0.7372,0.7491,0.7780,0.7655,0.7393,0.8392,0.8127,0.7442,0.9413,0.5162,0.5073,0.3355,0.4269,0.3262,0.1557,0.7250,0.6977,0.7639,0.4205,0.3252,0.1506
+Qwen2-72B-Instruct,>65B,0.4969,0.4670,0.2029,0.6210,0.6897,0.4713,0.3983,0.0356,0.0085,0.5508,0.6602,0.3609,0.6984,0.7472,0.6237,0.6711,0.7073,0.5588,0.5013,0.4768,0.2114,0.4109,0.1184,0.0309,0.6349,0.6718,0.4834,0.4284,0.2565,0.0767
+Opt-66B,>65B,0.4866,0.482,0.682,0.5174,0.5203,0.7258,0.5579,0.5338,0.8237,0.5646,0.5728,0.7868,0.5385,0.535,0.7659,0.5571,0.5309,0.8257,0.5414,0.5199,0.7954,0.5354,0.5181,0.7801,0.5376,0.515,0.7909,0.5079,0.5041,0.7185
+Llama3-ChatQA-1.5-70B,>65B,0.6682,0.6617,0.6566,0.6859,0.6932,0.6922,0.6079,0.6187,0.5348,0.6548,0.7024,0.6342,0.6861,0.6945,0.6928,0.7029,0.6853,0.7281,0.6211,0.6242,0.5599,0.6105,0.6189,0.5397,0.7134,0.6873,0.7493,0.59,0.6072,0.4996
+Llama-3.1-70B-Instruct,>65B,0.4845,0.3825,0.0896,0.5771,0.6976,0.3045,0.4546,0.2021,0.0359,0.6067,0.7722,0.3926,0.5946,0.7225,0.3403,0.5904,0.6813,0.3067,0.4817,0.3639,0.0828,0.4760,0.3471,0.0759,0.5340,0.5584,0.1851,0.4837,0.4207,0.1019
+Llama-3.3-70B-Instruct,>65B,0.5045,0.4639,0.0849,0.5211,0.6327,0.1537,0.4943,0.4221,0.0718,0.5173,0.7089,0.1918,0.5728,0.7424,0.2569,0.5775,0.7071,0.2347,0.4964,0.4060,0.0668,0.4960,0.4244,0.0712,0.5183,0.5179,0.1065,0.4820,0.3636,0.0544
+DeepSeek-R1-Distill-Llama-70B,>65B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.4770,0.3724,0.0843,0.6293,0.7886,0.4361,0.5619,0.6773,0.2789,0.5560,0.6236,0.2398,0.4694,0.2909,0.0598,0.4773,0.3611,0.0813,0.5191,0.5141,0.1569,0.4642,0.3155,0.0650
+Yi-1.5-34B-Chat,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6708,0.8646,0.7046,0.6528,0.9053,0.7084,0.6383,0.9309,0.5928,0.5672,0.6961,0.4467,0.4308,0.3972,0.6956,0.6281,0.9097,0.5182,0.515,0.5425
+Qwen2.5-32B-Instruct,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.5470,0.0453,0.8192,0.9583,0.6983,0.8514,0.9560,0.7445,0.7823,0.9396,0.5931,0.5869,0.8351,0.1922,0.5244,0.6511,0.0699,0.8334,0.9475,0.6950,0.5157,0.6401,0.0644
+Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.5517,0.7422,0.5108,0.5163,0.7304,0.5161,0.5039,0.7618,0.513,0.5009,0.7578,0.4956,0.4908,0.719,0.5119,0.4977,0.7583,0.4958,0.4955,0.7134
+QwQ-32B-Preview,~30B,0.6837,0.7403,0.5470,0.8120,0.8219,0.8084,0.6060,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.8230,0.8081,0.8470,0.8208,0.8801,0.6113,0.6736,0.3973,0.6050,0.6700,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
+Mistral-Small-24B-Instruct-2501,10B~20B,0.6626,0.6491,0.6746,0.7897,0.7347,0.9223,0.3990,0.2824,0.1406,0.7649,0.7465,0.8603,0.7828,0.7326,0.9081,0.8088,0.7280,0.9732,0.6010,0.6001,0.5490,0.4367,0.3723,0.2159,0.7369,0.6906,0.8282,0.4868,0.4773,0.3217
+Baichuan2-13B-Chat,10B~20B,0.6337,0.6402,0.5755,0.7188,0.7164,0.7457,0.5185,0.5189,0.3417,0.7341,0.7487,0.7703,0.7033,0.7091,0.7143,0.6742,0.6712,0.6575,0.5657,0.5728,0.434,0.6151,0.6264,0.5371,0.6515,0.65,0.6089,0.5532,0.5707,0.414
+Qwen1.5-14B-Chat,10B~20B,0.7099,0.6657,0.8141,0.7897,0.7205,0.9615,0.5669,0.5657,0.5226,0.7776,0.7373,0.9181,0.7571,0.7073,0.897,0.7862,0.7044,0.97,0.6421,0.6225,0.6757,0.5014,0.4893,0.3888,0.7563,0.6869,0.9116,0.5499,0.5538,0.4889
+Ziya2-13B-Chat,10B~20B,0.5403,0.5272,0.5731,0.6597,0.6313,0.8034,0.3259,0.2145,0.1373,0.673,0.6631,0.8101,0.6526,0.6282,0.7886,0.5583,0.5437,0.6097,0.3987,0.3541,0.2823,0.529,0.5194,0.5497,0.5377,0.5208,0.5678,0.4567,0.4484,0.4035
+InternLM2-Chat-20B,10B~20B,0.6819,0.7156,0.5781,0.7661,0.7819,0.7518,0.5506,0.5823,0.3134,0.8061,0.8182,0.8271,0.807,0.7993,0.832,0.8128,0.7876,0.8453,0.7037,0.7305,0.6224,0.6092,0.6548,0.4308,0.7815,0.7702,0.7821,0.5613,0.6058,0.3396
+Opt-13B,10B~20B,0.4746,0.4724,0.637,0.5147,0.519,0.7014,0.5146,0.5059,0.7153,0.5333,0.5557,0.7126,0.5261,0.5278,0.7228,0.5187,0.506,0.7257,0.5232,0.5081,0.7367,0.5218,0.5094,0.7314,0.4956,0.4856,0.6828,0.4722,0.4773,0.6264
+Mistral-Nemo-Instruct-2407,10B~20B,0.6375,0.6363,0.6018,0.6971,0.6973,0.7214,0.4741,0.4456,0.2722,0.6349,0.6873,0.6041,0.7122,0.7067,0.7508,0.7259,0.6960,0.7825,0.5252,0.5197,0.3718,0.4695,0.4343,0.2607,0.6126,0.6117,0.5492,0.4474,0.4009,0.2212
+Phi-3-medium-4k-instruct,10B~20B,0.5533,0.5494,0.4889,0.5385,0.5594,0.4653,0.6034,0.6005,0.5922,0.5418,0.5993,0.4803,0.5866,0.6054,0.5590,0.5815,0.5780,0.5475,0.6178,0.6070,0.6217,0.6437,0.6287,0.6742,0.6028,0.5912,0.5893,0.5057,0.5054,0.3950
+Phi-3-medium-128k-instruct,10B~20B,0.6379,0.6234,0.6581,0.6379,0.6437,0.6554,0.6504,0.6361,0.6823,0.5919,0.6413,0.5687,0.6431,0.6483,0.6654,0.6568,0.6374,0.6958,0.6632,0.6403,0.7087,0.6819,0.6546,0.7465,0.6796,0.6480,0.7433,0.5897,0.5935,0.5592
+phi-4,10B~20B,0.7431,0.7737,0.6700,0.7139,0.7762,0.6194,0.7081,0.7576,0.6003,0.6957,0.7921,0.5974,0.7625,0.8010,0.7146,0.8283,0.8125,0.8440,0.7130,0.7564,0.6083,0.6627,0.7239,0.5074,0.8171,0.8052,0.8213,0.6456,0.7165,0.4768
+Moonlight-16B-A3B-Instruct,10B~20B,0.4432,0.4087,0.3134,0.6335,0.6278,0.6971,0.3356,0.1806,0.0982,0.4713,0.5191,0.3914,0.5555,0.5699,0.5449,0.5349,0.5261,0.5011,0.4096,0.3505,0.2448,0.4197,0.3738,0.2672,0.4127,0.3514,0.2496,0.3428,0.2125,0.1175
 Gemma-1.1-7B-it,5B~10B,0.7849,0.7205,0.9139,0.8081,0.7454,0.9485,0.6024,0.6084,0.5413,0.7854,0.758,0.8894,0.8017,0.7436,0.9353,0.8215,0.7367,0.9884,0.6669,0.6543,0.673,0.5811,0.5858,0.4976,0.7831,0.7167,0.9127,0.6684,0.6638,0.6754
 Qwen1.5-7B-Chat,5B~10B,0.6885,0.6347,0.8535,0.7677,0.6891,0.9938,0.6929,0.6404,0.8588,0.7791,0.7151,0.9869,0.7653,0.6889,0.988,0.7485,0.6659,0.9746,0.684,0.6317,0.8443,0.7267,0.6564,0.929,0.7473,0.662,0.9772,0.5545,0.5496,0.5778
 Yi-1.5-9B-Chat,5B~10B,0.7025,0.6913,0.7058,0.7032,0.7106,0.707,0.4533,0.3925,0.2,0.6546,0.7097,0.6172,0.7209,0.7213,0.7419,0.8197,0.7508,0.9452,0.5595,0.5666,0.4131,0.4342,0.3378,0.1591,0.7626,0.7215,0.8306,0.4057,0.2654,0.1096
@@ -15,37 +35,5 @@ Opt-6.7B,5B~10B,0.4717,0.4691,0.6091,0.5087,0.5153,0.6691,0.4931,0.4895,0.6491,0
 Mistral-7B-Instruct-v0.3,5B~10B,0.7069,0.6749,0.7706,0.7521,0.7161,0.8533,0.5826,0.5868,0.5167,0.7142,0.7222,0.7711,0.7599,0.7205,0.8679,0.7956,0.7205,0.9509,0.6748,0.6547,0.7042,0.6139,0.6127,0.5802,0.7742,0.7074,0.9103,0.6388,0.6387,0.6313
 Llama3-ChatQA-1.5-8B,5B~10B,0.6114,0.5657,0.8761,0.6276,0.5904,0.885,0.5978,0.5613,0.844,0.6056,0.6016,0.8128,0.6113,0.5825,0.8521,0.6365,0.5805,0.9258,0.6062,0.5625,0.8663,0.6034,0.5629,0.8569,0.6223,0.5694,0.903,0.5658,0.5447,0.7752
 Ministral-8B-Instruct-2410,5B~10B,0.6447,0.6342,0.6442,0.7197,0.7001,0.7911,0.5176,0.5149,0.3869,0.6868,0.7082,0.7217,0.7326,0.7075,0.8161,0.7362,0.6919,0.8305,0.5742,0.5735,0.5003,0.4649,0.4306,0.2781,0.6894,0.6614,0.7369,0.5258,0.5313,0.4059
-Phi-3-small-8k-instruct,5B~10B,0.7598,0.7484,0.7666,0.7738,0.7711,0.7936,0.7227,0.7317,0.6914,0.7477,0.7825,0.7432,0.7999,0.7827,0.8445,0.8204,0.7762,0.8907,0.7106,0.7202,0.6662,0.6027,0.6353,0.4468,0.7871,0.7581,0.8233,0.
-Phi-3-small-128k-instruct,5B~10B,0.7158,0.7404,0.6454,0.6831,0.7398,0.5872,0.6751,0.7171,0.5633,0.6057,0.7155,0.4565,0.6783,0.7379,0.5785,0.8131,0.7893,0.8433,0.6832,0.7183,0.5779,0.6189,0.6671,0.4487,0.7353,0.7481,0.6836,0.5863,0.6398,0.3873
-Qwen3-4B,5B~10B,0.7069,0.6749,0.7706,0.7521,0.7161,0.8533,0.5826,0.5868,0.5167,0.7142,0.6114,0.5657,0.8761,0.6276,0.5904,0.885,0.5978,0.5613,0.844,0.6056,0.6447,0.6342,0.6442,0.7197,0.7001,0.7911,0.5176,0.5149,0.3869,0.6868
-Qwen3-8B,5B~10B,0.6114,0.5657,0.8761,0.6276,0.5904,0.885,0.5978,0.5613,0.844,0.6056,0.6447,0.6342,0.6442,0.7197,0.7001,0.7911,0.5176,0.5149,0.3869,0.6868,0.7598,0.7484,0.7666,0.7738,0.7711,0.7936,0.7227,0.7317,0.6914,0.7477
-Gemma-3-4b-it,5B~10B,0.6447,0.6342,0.6442,0.7197,0.7001,0.7911,0.5176,0.5149,0.3869,0.6868,0.7598,0.7484,0.7666,0.7738,0.7711,0.7936,0.7227,0.7317,0.6914,0.7477,0.7158,0.7404,0.6454,0.6831,0.7398,0.5872,0.6751,0.7171,0.5633,0.6057
-Mistral-Small-24B-Instruct-2501,10B~20B,0.6626,0.6491,0.6746,0.7897,0.7347,0.9223,0.399,0.2824,0.1406,0.7649,0.7465,0.8603,0.7828,0.7326,0.9081,0.8088,0.728,0.9732,0.601,0.6001,0.549,0.4367,0.3723,0.2159,0.7369,0.6906,0.8282,0.4868,0.4773,0.3217
-Baichuan2-13B-Chat,10B~20B,0.6337,0.6402,0.5755,0.7188,0.7164,0.7457,0.5185,0.5189,0.3417,0.7341,0.7487,0.7703,0.7033,0.7091,0.7143,0.6742,0.6712,0.6575,0.5657,0.5728,0.434,0.6151,0.6264,0.5371,0.6515,0.65,0.6089,0.5532,0.5707,0.414
-Qwen1.5-14B-Chat,10B~20B,0.7099,0.6657,0.8141,0.7897,0.7205,0.9615,0.5669,0.5657,0.5226,0.7776,0.7373,0.9181,0.7571,0.7073,0.897,0.7862,0.7044,0.97,0.6421,0.6225,0.6757,0.5014,0.4893,0.3888,0.7563,0.6869,0.9116,0.5499,0.5538,0.4889
-Ziya2-13B-Chat,10B~20B,0.5403,0.5272,0.5731,0.6597,0.6313,0.8034,0.3259,0.2145,0.1373,0.673,0.6631,0.8101,0.6526,0.6282,0.7886,0.5583,0.5437,0.6097,0.3987,0.3541,0.2823,0.529,0.5194,0.5497,0.5377,0.5208,0.5678,0.4567,0.4484,0.4035
-InternLM2-Chat-20B,10B~20B,0.6819,0.7156,0.5781,0.7661,0.7819,0.7518,0.5506,0.5823,0.3134,0.8061,0.8182,0.8271,0.807,0.7993,0.832,0.8128,0.7876,0.8453,0.7037,0.7305,0.6224,0.6092,0.6548,0.4308,0.7815,0.7702,0.7821,0.5613,0.6058,0.3396
-Opt-13B,10B~20B,0.4746,0.4724,0.637,0.5147,0.519,0.7014,0.5146,0.5059,0.7153,0.5333,0.5557,0.7126,0.5261,0.5278,0.7228,0.5187,0.506,0.7257,0.5232,0.5081,0.7367,0.5218,0.5094,0.7314,0.4956,0.4856,0.6828,0.4722,0.4773,0.6264
-Mistral-Nemo-Instruct-2407,10B~20B,0.6375,0.6363,0.6018,0.6971,0.6973,0.7214,0.4741,0.4456,0.2722,0.6349,0.6873,0.6041,0.7122,0.7067,0.7508,0.7259,0.696,0.7825,0.5252,0.5197,0.3718,0.4695,0.4343,0.2607,0.6126,0.6117,0.5492,0.4474,0.4009,0.2212
-Phi-3-medium-4k-instruct,10B~20B,0.5533,0.5494,0.4889,0.5385,0.5594,0.4653,0.6034,0.6005,0.5922,0.5418,0.5993,0.4803,0.5866,0.6054,0.559,0.5815,0.578,0.5475,0.6178,0.607,0.6217,0.6437,0.6287,0.6742,0.6028,0.5912,0.5893,0.5057,0.5054,0.395
-Phi-3-medium-128k-instruct,10B~20B,0.6379,0.6234,0.6581,0.6379,0.6437,0.6554,0.6504,0.6361,0.6823,0.5919,0.6413,0.5687,0.6431,0.6483,0.6654,0.6568,0.6374,0.6958,0.6632,0.6403,0.7087,0.6819,0.6546,0.7465,0.6796,0.648,0.7433,0.5897,0.5935,0.5592
-phi-4,10B~20B,0.7431,0.7737,0.67,0.7139,0.7762,0.6194,0.7081,0.7576,0.6003,0.6957,0.7921,0.5974,0.7625,0.801,0.7146,0.8283,0.8125,0.844,0.713,0.7564,0.6083,0.6627,0.7239,0.5074,0.8171,0.8052,0.8213,0.6456,0.7165,0.4768
-Moonlight-16B-A3B-Instruct,10B~20B,0.4432,0.4087,0.3134,0.6335,0.6278,0.6971,0.3356,0.1806,0.0982,0.4713,0.5191,0.3914,0.5555,0.5699,0.5449,0.5349,0.5261,0.5011,0.4096,0.3505,0.2448,0.4197,0.3738,0.2672,0.4127,0.3514,0.2496,0.3428,0.2125,0.1175
-Qwen3-14B,10B~20B,0.6375,0.6363,0.6018,0.6971,0.6973,0.7214,0.4741,0.4456,0.2722,0.6349,0.5533,0.5494,0.4889,0.5385,0.5594,0.4653,0.6034,0.6005,0.5922,0.5418,0.6379,0.6234,0.6581,0.6379,0.6437,0.6554,0.6504,0.6361,0.6823,0.5919
-Gemma-3-12b-it,10B~20B,0.5533,0.5494,0.4889,0.5385,0.5594,0.4653,0.6034,0.6005,0.5922,0.5418,0.6379,0.6234,0.6581,0.6379,0.6437,0.6554,0.6504,0.6361,0.6823,0.5919,0.7431,0.7737,0.67,0.7139,0.7762,0.6194,0.7081,0.7576,0.6003,0.6957
-DeepSeek-LLM-67B-Chat,>65B,0.7897,0.7454,0.8652,0.8482,0.7832,0.9726,0.6603,0.6751,0.6011,0.8344,0.7978,0.932,0.8367,0.78,0.9497,0.8449,0.769,0.9767,0.7985,0.7493,0.8825,0.6171,0.6366,0.5125,0.8258,0.7583,0.9401,0.7387,0.7276,0.7596
-Qwen1.5-72B-Chat,>65B,0.5998,0.693,0.3298,0.8005,0.8477,0.7444,0.4697,0.3314,0.0703,0.6671,0.812,0.506,0.7676,0.8369,0.6803,0.7069,0.7895,0.5476,0.5825,0.6666,0.2918,0.4697,0.3186,0.0668,0.7076,0.7867,0.546,0.5283,0.5803,0.1942
-Qwen2.5-72B-Instruct,>65B,0.6248,0.6318,0.558,0.8125,0.7581,0.9309,0.3779,0.1555,0.0588,0.7372,0.7491,0.778,0.7655,0.7393,0.8392,0.8127,0.7442,0.9413,0.5162,0.5073,0.3355,0.4269,0.3262,0.1557,0.725,0.6977,0.7639,0.4205,0.3252,0.1506
-Qwen2-72B-Instruct,>65B,0.4969,0.467,0.2029,0.621,0.6897,0.4713,0.3983,0.0356,0.0085,0.5508,0.6602,0.3609,0.6984,0.7472,0.6237,0.6711,0.7073,0.5588,0.5013,0.4768,0.2114,0.4109,0.1184,0.0309,0.6349,0.6718,0.4834,0.4284,0.2565,0.0767
-Opt-66B,>65B,0.4866,0.482,0.682,0.5174,0.5203,0.7258,0.5579,0.5338,0.8237,0.5646,0.5728,0.7868,0.5385,0.535,0.7659,0.5571,0.5309,0.8257,0.5414,0.5199,0.7954,0.5354,0.5181,0.7801,0.5376,0.515,0.7909,0.5079,0.5041,0.7185
-Llama3-ChatQA-1.5-70B,>65B,0.6682,0.6617,0.6566,0.6859,0.6932,0.6922,0.6079,0.6187,0.5348,0.6548,0.7024,0.6342,0.6861,0.6945,0.6928,0.7029,0.6853,0.7281,0.6211,0.6242,0.5599,0.6105,0.6189,0.5397,0.7134,0.6873,0.7493,0.59,0.6072,0.4996
-Llama-3.1-70B-Instruct,>65B,0.4845,0.3825,0.0896,0.5771,0.6976,0.3045,0.4546,0.2021,0.0359,0.6067,0.7722,0.3926,0.5946,0.7225,0.3403,0.5904,0.6813,0.3067,0.4817,0.3639,0.0828,0.476,0.3471,0.0759,0.534,0.5584,0.1851,0.4837,0.4207,0.1019
-Llama-3.3-70B-Instruct,>65B,0.5045,0.4639,0.0849,0.5211,0.6327,0.1537,0.4943,0.4221,0.0718,0.5173,0.7089,0.1918,0.5728,0.7424,0.2569,0.5775,0.7071,0.2347,0.4964,0.406,0.0668,0.496,0.4244,0.0712,0.5183,0.5179,0.1065,0.482,0.3636,0.0544
-DeepSeek-R1-Distill-Llama-70B,>65B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.7886,0.4361,0.5619,0.6773,0.2789,0.556,0.6236,0.2398,0.4694,0.2909,0.0598,0.4773,0.3611,0.0813,0.5191,0.5141,0.1569,0.4642,0.3155,0.065
-Yi-1.5-34B-Chat,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6708,0.8646,0.7046,0.6528,0.9053,0.7084,0.6383,0.9309,0.5928,0.5672,0.6961,0.4467,0.4308,0.3972,0.6956,0.6281,0.9097,0.5182,0.515,0.5425
-Qwen2.5-32B-Instruct,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.9583,0.6983,0.8514,0.956,0.7445,0.7823,0.9396,0.5931,0.5869,0.8351,0.1922,0.5244,0.6511,0.0699,0.8334,0.9475,0.695,0.5157,0.6401,0.0644
-Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.5517,0.7422,0.5108,0.5163,0.7304,0.5161,0.5039,0.7618,0.513,0.5009,0.7578,0.4956,0.4908,0.719,0.5119,0.4977,0.7583,0.4958,0.4955,0.7134
-QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
-Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
-Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
-OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516
+Phi-3-small-8k-instruct,5B~10B,0.7598,0.7484,0.7666,0.7738,0.7711,0.7936,0.7227,0.7317,0.6914,0.7477,0.7825,0.7432,0.7999,0.7827,0.8445,0.8204,0.7762,0.8907,0.7106,0.7202,0.6662,0.6027,0.6353,0.4468,0.7871,0.7581,0.8233,0.5930,0.6349,0.4325
+Phi-3-small-128k-instruct,5B~10B,0.7158,0.7404,0.6454,0.6831,0.7398,0.5872,0.6751,0.7171,0.5633,0.6057,0.7155,0.4565,0.6783,0.7379,0.5785,0.8131,0.7893,0.8433,0.6832,0.7183,0.5779,0.6189,0.6671,0.4487,0.7353,0.7481,0.6836,0.5863,0.6398,0.3873
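The subclass tables above store three metrics (Accuracy, Precision, Recall) for each of the ten subclasses per model, which is why app.py assembles column names from the chosen subclass. A short sketch of that selection (assuming the comma-separated layout shown above; the chosen subclass and size filter are only illustrative):

```python
import pandas as pd

df = pd.read_csv("./data/subclass_gen.csv", sep=",")

# Mirrors get_dataset_csv_sub_gen(): build the three metric column names for one subclass.
subclass_choice = "Politics"  # any subclass present in the header, e.g. Vulgarity or Ethics
columns = ["Model",
           subclass_choice + "_Accuracy",
           subclass_choice + "_Precision",
           subclass_choice + "_Recall"]
table = df[df["Size"].isin(["5B~10B", "10B~20B"])][columns]
print(table.sort_values(subclass_choice + "_Accuracy", ascending=False).head())
```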
data/subclass_per.csv
CHANGED
@@ -1,4 +1,28 @@
 Model,Size,Discrimination_Accuracy,Discrimination_Precision,Discrimination_Recall,Variant_Accuracy,Variant_Precision,Variant_Recall,Psychology_Accuracy,Psychology_Precision,Psychology_Recall,Politics_Accuracy,Politics_Precision,Politics_Recall,Eroticism_Accuracy,Eroticism_Precision,Eroticism_Recall,Vulgarity_Accuracy,Vulgarity_Precision,Vulgarity_Recall,Property_Accuracy,Property_Precision,Property_Recall,Injury_Accuracy,Injury_Precision,Injury_Recall,Criminality_Accuracy,Criminality_Precision,Criminality_Recall,Ethics_Accuracy,Ethics_Precision,Ethics_Recall
+DeepSeek-LLM-67B-Chat,>65B,0.6948,0.9451,0.3989,0.6447,0.9375,0.3259,0.5122,0.5824,0.033,0.7673,0.9695,0.5903,0.6865,0.9516,0.4092,0.899,0.9725,0.8159,0.66,0.9341,0.326,0.5479,0.8184,0.1017,0.8777,0.9706,0.7709,0.5142,0.6736,0.0456
+Qwen1.5-72B-Chat,>65B,0.6479,0.581,0.9985,0.6609,0.6019,0.9938,0.6472,0.5837,0.9906,0.5928,0.5895,0.8276,0.6544,0.5996,0.9796,0.6488,0.5823,0.9987,0.6448,0.5792,0.9932,0.6255,0.5712,0.9493,0.6433,0.5763,0.9951,0.6485,0.5872,0.9874
+Qwen2.5-72B-Instruct,>65B,0.6292,0.6414,0.5480,0.8411,0.7760,0.9689,0.3631,0.0282,0.0086,0.7521,0.7629,0.7894,0.7928,0.7585,0.8742,0.8142,0.7522,0.9248,0.5333,0.5328,0.3499,0.3959,0.1923,0.0723,0.7490,0.7180,0.7928,0.3967,0.2195,0.0826
+Qwen2-72B-Instruct,>65B,0.6587,0.5982,0.9159,0.7064,0.6373,0.9870,0.4112,0.4039,0.4090,0.6611,0.6383,0.8691,0.6920,0.6315,0.9577,0.6948,0.6175,0.9884,0.6106,0.5703,0.8181,0.4184,0.4103,0.4236,0.6658,0.5992,0.9347,0.4887,0.4879,0.5650
+Opt-66B,>65B,0.645,0.5831,0.9572,0.3981,0.417,0.4471,0.6667,0.5971,0.9953,0.6232,0.6095,0.8551,0.4854,0.4984,0.6176,0.652,0.5874,0.9698,0.6511,0.5859,0.9706,0.6604,0.5926,0.9853,0.6556,0.586,0.9846,0.655,0.5943,0.9665
+Llama3-ChatQA-1.5-70B,>65B,0.3666,0.2082,0.1069,0.339,0.169,0.0752,0.3147,0.0148,0.0059,0.2947,0.075,0.0261,0.7758,0.7167,0.9293,0.5528,0.5482,0.4877,0.3396,0.111,0.0507,0.3207,0.0374,0.0156,0.4392,0.3806,0.2524,0.3214,0.0614,0.0253
+Llama-3.1-70B-Instruct,>65B,0.4670,0.4105,0.2107,0.3766,0.1681,0.0560,0.3856,0.1439,0.0505,0.3460,0.1387,0.0392,0.4036,0.2873,0.1107,0.3872,0.1394,0.0487,0.4967,0.4715,0.2711,0.4070,0.2331,0.0910,0.4985,0.4691,0.2716,0.6337,0.6553,0.5548
+Llama-3.3-70B-Instruct,>65B,0.3996,0.3526,0.2759,0.2923,0.1430,0.0771,0.3029,0.1420,0.0825,0.2624,0.1066,0.0486,0.3657,0.3253,0.2213,0.3305,0.2121,0.1358,0.4583,0.4388,0.3966,0.3156,0.1750,0.1062,0.4510,0.4249,0.3802,0.5813,0.5696,0.6459
+DeepSeek-R1-Distill-Llama-70B,>65B,0.4240,0.2914,0.1265,0.6148,0.6530,0.5255,0.3608,0.0107,0.0033,0.5182,0.5945,0.3588,0.5583,0.5989,0.4156,0.4922,0.4667,0.2664,0.4312,0.3134,0.1401,0.3727,0.0743,0.0243,0.4061,0.2132,0.0844,0.5370,0.5522,0.3638
+Yi-1.5-34B-Chat,~30B,0.7139,0.8341,0.5176,0.7722,0.8735,0.6482,0.475,0.2581,0.0357,0.7162,0.8717,0.5603,0.6206,0.7912,0.353,0.8816,0.8938,0.8601,0.6412,0.7813,0.3672,0.497,0.4306,0.0769,0.8472,0.8832,0.7889,0.4818,0.3646,0.0576
+Qwen2.5-32B-Instruct,~30B,0.6749,0.6366,0.7789,0.7893,0.7099,0.9938,0.4372,0.4025,0.2943,0.7921,0.7323,0.9739,0.7723,0.7036,0.9599,0.7702,0.6873,0.9727,0.5920,0.5774,0.6092,0.4358,0.3969,0.2906,0.7404,0.6695,0.9160,0.4640,0.4506,0.3514
+Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.5798,0.6281,0.5559,0.357,0.2405,0.1185,0.406,0.3224,0.1945,0.6203,0.6061,0.633,0.6188,0.6076,0.6293,0.6031,0.5886,0.5976,0.6244,0.6184,0.6415
+QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1.0000,0.4141,0.4443,0.7537,0.5814,0.5650,0.9989,0.5529,0.5340,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.8080,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
+Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.6830,0.9385,0.7584,0.6732,0.9835,0.5850,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.6450,0.8855,0.3839,0.3257,0.2233
+Baichuan2-13B-Chat,10B~20B,0.7346,0.6715,0.8932,0.7703,0.7043,0.9491,0.6303,0.6129,0.6785,0.7435,0.7152,0.8777,0.779,0.7088,0.9649,0.7677,0.6883,0.9601,0.6763,0.6388,0.7738,0.6359,0.6149,0.6904,0.7096,0.6554,0.8436,0.7306,0.6762,0.8788
+Qwen1.5-14B-Chat,10B~20B,0.625,0.5683,0.964,0.6549,0.5977,0.9932,0.5983,0.5571,0.9038,0.6561,0.6193,0.9535,0.6592,0.6005,0.9994,0.6382,0.5759,0.9897,0.5579,0.53,0.8275,0.5009,0.4938,0.7077,0.6256,0.566,0.9705,0.6063,0.5643,0.914
+Ziya2-13B-Chat,10B~20B,0.6322,0.6632,0.502,0.381,0.0822,0.0212,0.4263,0.2557,0.086,0.4352,0.4474,0.1651,0.612,0.6721,0.4744,0.812,0.7741,0.8691,0.4904,0.4516,0.2102,0.5309,0.5403,0.2964,0.7186,0.7235,0.6777,0.4811,0.4512,0.2021
+InternLM2-Chat-20B,10B~20B,0.5184,0.5912,0.0441,0.4754,0.0222,0.0006,0.4929,0.0222,0.0006,0.4744,0.7043,0.0573,0.605,0.904,0.256,0.5265,0.6774,0.0625,0.5689,0.8292,0.146,0.5046,0.4073,0.0202,0.7142,0.9352,0.44,0.498,0.4041,0.0196
+Opt-13B,10B~20B,0.5011,0.0392,0.0015,0.4792,0.0695,0.0018,0.4958,0,0,0.4492,0.237,0.0055,0.4897,0.5438,0.0249,0.4996,0.0333,0.0006,0.5037,0.1931,0.0055,0.5454,0.8065,0.0965,0.5155,0.499,0.0228,0.5016,0.4815,0.0203
+Mistral-Nemo-Instruct-2407,10B~20B,0.6992,0.6359,0.8960,0.7518,0.6773,0.9826,0.6421,0.6067,0.7767,0.7290,0.6896,0.9121,0.7377,0.6719,0.9542,0.7482,0.6611,0.9959,0.6396,0.6014,0.7754,0.6045,0.5803,0.7019,0.7246,0.6464,0.9529,0.4910,0.4881,0.4717
+Phi-3-medium-4k-instruct,10B~20B,0.8162,0.7447,0.9484,0.3950,0.2748,0.1126,0.8368,0.7558,0.9878,0.5763,0.6486,0.4809,0.6431,0.6695,0.5981,0.8403,0.7549,0.9973,0.8092,0.7414,0.9343,0.8263,0.7504,0.9679,0.8352,0.7499,0.9896,0.6361,0.6499,0.5818
+Phi-3-medium-128k-instruct,10B~20B,0.8024,0.7318,0.9391,0.3592,0.1596,0.0598,0.8232,0.7434,0.9790,0.5228,0.5910,0.3977,0.5699,0.6022,0.4725,0.8293,0.7436,0.9939,0.7813,0.7222,0.8963,0.8009,0.7328,0.9351,0.8260,0.7393,0.9898,0.6525,0.6565,0.6327
+phi-4,10B~20B,0.6193,0.6166,0.5816,0.4118,0.3517,0.1792,0.7011,0.6785,0.7484,0.7224,0.7291,0.7791,0.6152,0.6372,0.5775,0.7375,0.6960,0.8232,0.5775,0.5779,0.4961,0.6685,0.6560,0.6821,0.7074,0.6752,0.7638,0.4629,0.4356,0.2692
+Moonlight-16B-A3B-Instruct,10B~20B,0.5041,0.0556,0.0006,0.4814,0.0000,0.0000,0.4992,0.0000,0.0000,0.4500,0.1369,0.0016,0.4804,0.0256,0.0007,0.5027,0.0000,0.0000,0.5054,0.0893,0.0020,0.5020,0.0972,0.0014,0.5080,0.0256,0.0007,0.4947,0.0000,0.0000
 Gemma-1.1-7B-it,5B~10B,0.6885,0.6193,0.9389,0.7201,0.6502,0.9795,0.6709,0.6133,0.8985,0.7171,0.6709,0.9421,0.5993,0.5861,0.7426,0.7164,0.634,0.9953,0.6316,0.5872,0.8235,0.5207,0.5098,0.595,0.6874,0.616,0.9415,0.6164,0.5853,0.7856
 Qwen1.5-7B-Chat,5B~10B,0.6415,0.5933,0.8439,0.7295,0.6542,0.9987,0.5495,0.5352,0.6535,0.7415,0.6808,0.9875,0.7286,0.6545,0.9955,0.7167,0.6339,0.9966,0.6122,0.5749,0.784,0.4866,0.4788,0.5265,0.6887,0.6165,0.9449,0.4276,0.4219,0.4072
 Yi-1.5-9B-Chat,5B~10B,0.7089,0.8612,0.4825,0.5418,0.7129,0.1741,0.4846,0.2932,0.0308,0.5376,0.7743,0.2115,0.6185,0.8236,0.3254,0.818,0.9011,0.7057,0.5819,0.7416,0.2207,0.4893,0.3279,0.0365,0.7959,0.8937,0.6572,0.477,0.2414,0.0233
@@ -10,35 +34,6 @@ InternLM2-Chat-7B,5B~10B,0.4988,0,0,0.4767,0,0,0.4943,0,0,0.4453,0.0513,0.0011,0
 Opt-6.7B,5B~10B,0.5189,0.5038,0.9645,0.3756,0.4266,0.6456,0.5227,0.5083,0.9638,0.549,0.5504,0.9314,0.2606,0.3276,0.4205,0.4833,0.4847,0.8892,0.5274,0.508,0.9831,0.5244,0.508,0.971,0.5105,0.4973,0.9551,0.5322,0.5159,0.9757
 Mistral-7B-Instruct-v0.3,5B~10B,0.4091,0.3399,0.2241,0.3013,0.0672,0.0286,0.3093,0.0548,0.0246,0.3554,0.3176,0.1618,0.4671,0.473,0.3538,0.62,0.6022,0.655,0.432,0.3832,0.2701,0.3362,0.1517,0.0771,0.6338,0.6081,0.6844,0.3814,0.2943,0.1744
 Llama3-ChatQA-1.5-8B,5B~10B,0.387,0.2816,0.1665,0.3232,0.1355,0.0603,0.3054,0.011,0.0045,0.292,0.0948,0.0354,0.7946,0.7193,0.9821,0.5375,0.5306,0.4746,0.3702,0.2367,0.1312,0.318,0.0621,0.0276,0.4823,0.4562,0.3594,0.3398,0.1632,0.0793
-Ministral-8B-Instruct-2410,5B~10B,0.
-Phi-3-small-8k-instruct,5B~10B,0.6365,0.5771,0.9543,0.4834,0.4955,0.6276,0.6479,0.5862,0.9722,0.6323,0.6122,0.8839,0.6153,0.
-Phi-3-small-128k-instruct,5B~10B,0.6085,0.5851,0.
-Qwen3-4B,5B~10B,0.4091 ,0.3399 ,0.2241 ,0.3013 ,0.0672 ,0.0286 ,0.3093 ,0.0548 ,0.0246 ,0.3554 ,0.3870 ,0.2816 ,0.1665 ,0.3232 ,0.1355 ,0.0603 ,0.3054 ,0.0110 ,0.0045 ,0.2920 ,0.6080 ,0.5720 ,0.7836 ,0.7143 ,0.6458 ,0.9763 ,0.3260 ,0.2659 ,0.2081 ,0.6623
-Qwen3-8B,5B~10B,0.3870 ,0.2816 ,0.1665 ,0.3232 ,0.1355 ,0.0603 ,0.3054 ,0.0110 ,0.0045 ,0.2920 ,0.6080 ,0.5720 ,0.7836 ,0.7143 ,0.6458 ,0.9763 ,0.3260 ,0.2659 ,0.2081 ,0.6623 ,0.6365 ,0.5771 ,0.9543 ,0.4834 ,0.4955 ,0.6276 ,0.6479 ,0.5862 ,0.9722 ,0.6323
-Baichuan2-13B-Chat,10B~20B,0.7346,0.6715,0.8932,0.7703,0.7043,0.9491,0.6303,0.6129,0.6785,0.7435,0.7152,0.8777,0.779,0.7088,0.9649,0.7677,0.6883,0.9601,0.6763,0.6388,0.7738,0.6359,0.6149,0.6904,0.7096,0.6554,0.8436,0.7306,0.6762,0.8788
-Qwen1.5-14B-Chat,10B~20B,0.625,0.5683,0.964,0.6549,0.5977,0.9932,0.5983,0.5571,0.9038,0.6561,0.6193,0.9535,0.6592,0.6005,0.9994,0.6382,0.5759,0.9897,0.5579,0.53,0.8275,0.5009,0.4938,0.7077,0.6256,0.566,0.9705,0.6063,0.5643,0.914
-Ziya2-13B-Chat,10B~20B,0.6322,0.6632,0.502,0.381,0.0822,0.0212,0.4263,0.2557,0.086,0.4352,0.4474,0.1651,0.612,0.6721,0.4744,0.812,0.7741,0.8691,0.4904,0.4516,0.2102,0.5309,0.5403,0.2964,0.7186,0.7235,0.6777,0.4811,0.4512,0.2021
-InternLM2-Chat-20B,10B~20B,0.5184,0.5912,0.0441,0.4754,0.0222,0.0006,0.4929,0.0222,0.0006,0.4744,0.7043,0.0573,0.605,0.904,0.256,0.5265,0.6774,0.0625,0.5689,0.8292,0.146,0.5046,0.4073,0.0202,0.7142,0.9352,0.44,0.498,0.4041,0.0196
-Opt-13B,10B~20B,0.5011,0.0392,0.0015,0.4792,0.0695,0.0018,0.4958,0,0,0.4492,0.237,0.0055,0.4897,0.5438,0.0249,0.4996,0.0333,0.0006,0.5037,0.1931,0.0055,0.5454,0.8065,0.0965,0.5155,0.499,0.0228,0.5016,0.4815,0.0203
-Mistral-Nemo-Instruct-2407,10B~20B,0.6992,0.6359,0.896,0.7518,0.6773,0.9826,0.6421,0.6067,0.7767,0.729,0.6896,0.9121,0.7377,0.6719,0.9542,0.7482,0.6611,0.9959,0.6396,0.6014,0.7754,0.6045,0.5803,0.7019,0.7246,0.6464,0.9529,0.491,0.4881,0.4717
-Phi-3-medium-4k-instruct,10B~20B,0.8162,0.7447,0.9484,0.395,0.2748,0.1126,0.8368,0.7558,0.9878,0.5763,0.6486,0.4809,0.6431,0.6695,0.5981,0.8403,0.7549,0.9973,0.8092,0.7414,0.9343,0.8263,0.7504,0.9679,0.8352,0.7499,0.9896,0.6361,0.6499,0.5818
-Phi-3-medium-128k-instruct,10B~20B,0.8024,0.7318,0.9391,0.3592,0.1596,0.0598,0.8232,0.7434,0.979,0.5228,0.591,0.3977,0.5699,0.6022,0.4725,0.8293,0.7436,0.9939,0.7813,0.7222,0.8963,0.8009,0.7328,0.9351,0.826,0.7393,0.9898,0.6525,0.6565,0.6327
-phi-4,10B~20B,0.6193,0.6166,0.5816,0.4118,0.3517,0.1792,0.7011,0.6785,0.7484,0.7224,0.7291,0.7791,0.6152,0.6372,0.5775,0.7375,0.696,0.8232,0.5775,0.5779,0.4961,0.6685,0.656,0.6821,0.7074,0.6752,0.7638,0.4629,0.4356,0.2692
-Moonlight-16B-A3B-Instruct,10B~20B,0.5041,0.0556,0.0006,0.4814,0,0,0.4992,0,0,0.45,0.1369,0.0016,0.4804,0.0256,0.0007,0.5027,0,0,0.5054,0.0893,0.002,0.502,0.0972,0.0014,0.508,0.0256,0.0007,0.4947,0,0
-Qwen3-14B,10B~20B,0.6992 ,0.6359 ,0.8960 ,0.7518 ,0.6773 ,0.9826 ,0.6421 ,0.6067 ,0.7767 ,0.7290 ,0.8162 ,0.7447 ,0.9484 ,0.3950 ,0.2748 ,0.1126 ,0.8368 ,0.7558 ,0.9878 ,0.5763 ,0.8024 ,0.7318 ,0.9391 ,0.3592 ,0.1596 ,0.0598 ,0.8232 ,0.7434 ,0.9790 ,0.5228
-DeepSeek-LLM-67B-Chat,>65B,0.6948,0.9451,0.3989,0.6447,0.9375,0.3259,0.5122,0.5824,0.033,0.7673,0.9695,0.5903,0.6865,0.9516,0.4092,0.899,0.9725,0.8159,0.66,0.9341,0.326,0.5479,0.8184,0.1017,0.8777,0.9706,0.7709,0.5142,0.6736,0.0456
-Qwen1.5-72B-Chat,>65B,0.6479,0.581,0.9985,0.6609,0.6019,0.9938,0.6472,0.5837,0.9906,0.5928,0.5895,0.8276,0.6544,0.5996,0.9796,0.6488,0.5823,0.9987,0.6448,0.5792,0.9932,0.6255,0.5712,0.9493,0.6433,0.5763,0.9951,0.6485,0.5872,0.9874
-Qwen2.5-72B-Instruct,>65B,0.6292,0.6414,0.548,0.8411,0.776,0.9689,0.3631,0.0282,0.0086,0.7521,0.7629,0.7894,0.7928,0.7585,0.8742,0.8142,0.7522,0.9248,0.5333,0.5328,0.3499,0.3959,0.1923,0.0723,0.749,0.718,0.7928,0.3967,0.2195,0.0826
-Qwen2-72B-Instruct,>65B,0.6587,0.5982,0.9159,0.7064,0.6373,0.987,0.4112,0.4039,0.409,0.6611,0.6383,0.8691,0.692,0.6315,0.9577,0.6948,0.6175,0.9884,0.6106,0.5703,0.8181,0.4184,0.4103,0.4236,0.6658,0.5992,0.9347,0.4887,0.4879,0.565
-Opt-66B,>65B,0.645,0.5831,0.9572,0.3981,0.417,0.4471,0.6667,0.5971,0.9953,0.6232,0.6095,0.8551,0.4854,0.4984,0.6176,0.652,0.5874,0.9698,0.6511,0.5859,0.9706,0.6604,0.5926,0.9853,0.6556,0.586,0.9846,0.655,0.5943,0.9665
-Llama3-ChatQA-1.5-70B,>65B,0.3666,0.2082,0.1069,0.339,0.169,0.0752,0.3147,0.0148,0.0059,0.2947,0.075,0.0261,0.7758,0.7167,0.9293,0.5528,0.5482,0.4877,0.3396,0.111,0.0507,0.3207,0.0374,0.0156,0.4392,0.3806,0.2524,0.3214,0.0614,0.0253
-Llama-3.1-70B-Instruct,>65B,0.467,0.4105,0.2107,0.3766,0.1681,0.056,0.3856,0.1439,0.0505,0.346,0.1387,0.0392,0.4036,0.2873,0.1107,0.3872,0.1394,0.0487,0.4967,0.4715,0.2711,0.407,0.2331,0.091,0.4985,0.4691,0.2716,0.6337,0.6553,0.5548
-Llama-3.3-70B-Instruct,>65B,0.3996,0.3526,0.2759,0.2923,0.143,0.0771,0.3029,0.142,0.0825,0.2624,0.1066,0.0486,0.3657,0.3253,0.2213,0.3305,0.2121,0.1358,0.4583,0.4388,0.3966,0.3156,0.175,0.1062,0.451,0.4249,0.3802,0.5813,0.5696,0.6459
-DeepSeek-R1-Distill-Llama-70B,>65B,0.424,0.2914,0.1265,0.6148,0.653,0.5255,0.3608,0.0107,0.0033,0.5182,0.5945,0.3588,0.5583,0.5989,0.4156,0.4922,0.4667,0.2664,0.4312,0.3134,0.1401,0.3727,0.0743,0.0243,0.4061,0.2132,0.0844,0.537,0.5522,0.3638
-Yi-1.5-34B-Chat,~30B,0.7139,0.8341,0.5176,0.7722,0.8735,0.6482,0.475,0.2581,0.0357,0.7162,0.8717,0.5603,0.6206,0.7912,0.353,0.8816,0.8938,0.8601,0.6412,0.7813,0.3672,0.497,0.4306,0.0769,0.8472,0.8832,0.7889,0.4818,0.3646,0.0576
-Qwen2.5-32B-Instruct,~30B,0.6749,0.6366,0.7789,0.7893,0.7099,0.9938,0.4372,0.4025,0.2943,0.7921,0.7323,0.9739,0.7723,0.7036,0.9599,0.7702,0.6873,0.9727,0.592,0.5774,0.6092,0.4358,0.3969,0.2906,0.7404,0.6695,0.916,0.464,0.4506,0.3514
-Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.5798,0.6281,0.5559,0.357,0.2405,0.1185,0.406,0.3224,0.1945,0.6203,0.6061,0.633,0.6188,0.6076,0.6293,0.6031,0.5886,0.5976,0.6244,0.6184,0.6415
-QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
-Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
-OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
-Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814
+Ministral-8B-Instruct-2410,5B~10B,0.6080,0.5720,0.7836,0.7143,0.6458,0.9763,0.3260,0.2659,0.2081,0.6623,0.6438,0.8483,0.7052,0.6425,0.9570,0.7069,0.6278,0.9849,0.5197,0.5064,0.6030,0.3152,0.2420,0.1855,0.6558,0.5975,0.8847,0.4132,0.4054,0.3860
+Phi-3-small-8k-instruct,5B~10B,0.6365,0.5771,0.9543,0.4834,0.4955,0.6276,0.6479,0.5862,0.9722,0.6323,0.6122,0.8839,0.6153,0.5820,0.8846,0.6563,0.5881,0.9939,0.5791,0.5445,0.8380,0.6012,0.5593,0.8793,0.6322,0.5723,0.9513,0.4856,0.4872,0.6404
+Phi-3-small-128k-instruct,5B~10B,0.6085,0.5851,0.6810,0.3324,0.2336,0.1343,0.7347,0.6638,0.9355,0.6062,0.6315,0.6625,0.6078,0.6056,0.6736,0.7148,0.6513,0.8975,0.6468,0.6108,0.7597,0.7331,0.6615,0.9338,0.7076,0.6437,0.8871,0.4432,0.4270,0.3467