Jay committed · b93bb99
Parent(s): 6fc8478

doc: update changelog

Files changed:
- app.py +3 -6
- assets/text.py +3 -4
- changelog.md +12 -1
app.py
CHANGED
@@ -15,7 +15,6 @@ ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -165,9 +164,6 @@ def get_ChineseGuardBench(
     main_choice: List[str],
 ):
     leaderboard_table = get_dataset_new_csv(model_size)
-    # elif main_choice != "Subclass":
-    #     subclass_choice = main_choice
-    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
 
@@ -216,12 +212,12 @@ with gr.Blocks() as demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
 
-        with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=
+        with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=5):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
 
-        with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=
+        with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=6):
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
@@ -292,6 +288,7 @@ with gr.Blocks() as demo:
     )
 
     # this is new results for ChineseGuardBench
+
     # main_choice.change(
     #     get_ChineseGuardBench,
     #     inputs=[model_choice, main_choice],
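The hunks above reduce get_ChineseGuardBench to a plain table lookup and leave the main_choice.change callback commented out. For orientation, here is a minimal sketch (not the repo's actual code) of how these pieces could wire together once that callback is re-enabled; the body of get_dataset_new_csv, the model_choice/main_choice widgets, and the "Size" column are assumptions, and only the identifiers and the CSV path come from the diff.

```python
# Minimal sketch, assuming hypothetical widgets and a hypothetical "Size" column.
from typing import List

import gradio as gr
import pandas as pd

ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding="utf-8")  # path from the diff


def get_dataset_new_csv(model_size: List[str]) -> pd.DataFrame:
    # Assumed filter: keep rows whose "Size" column matches the selected buckets.
    if not model_size:
        return ORIGINAL_DF_NEW
    return ORIGINAL_DF_NEW[ORIGINAL_DF_NEW["Size"].isin(model_size)]


def get_ChineseGuardBench(model_size: List[str], main_choice: List[str]) -> pd.DataFrame:
    # main_choice is accepted but unused, matching the trimmed function in the diff.
    return get_dataset_new_csv(model_size)


with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(["<7B", "7B-13B", ">13B"], label="Model size")  # assumed widget
    main_choice = gr.Radio(["Overall", "Subclass"], label="View")  # assumed widget

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Generation", elem_id="od-benchmark-tab-table", id=5):
            dataframe_all_gen = gr.components.Dataframe(elem_id="leaderboard-table")

    # Re-enabling the callback that the commit leaves commented out:
    main_choice.change(
        get_ChineseGuardBench,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )

demo.launch()
```

Keeping main_choice in the signature even though it is unused lets the commented-out callback be restored later without touching the function again.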
assets/text.py
CHANGED
@@ -34,14 +34,13 @@ EVALUTION_TEXT= """
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
-For generation, we use the content generated by the model to make prediction.
-
-
+For generation, we use the content generated by the model to make prediction.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.
+The following are the results of the evaluation.πππ
 </span> <br><br>
 
 
 """ # noqa
-
 REFERENCE_TEXT = """
 # References
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
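The EVALUTION_TEXT edited above describes the perplexity (multiple-choice) protocol: each candidate label is scored by the perplexity the model assigns it, and the lowest-perplexity label is taken as the prediction. Here is a minimal sketch of that scoring, assuming a Hugging Face causal LM; the model name, prompt, and safe/unsafe label set are placeholders, not the project's actual configuration.

```python
# Sketch: pick the label whose continuation gets the lowest LM loss (a monotone
# proxy for perplexity). Model name and labels are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()


@torch.no_grad()
def pick_label(prompt: str, labels: list[str]) -> str:
    losses = []
    for label in labels:
        ids = tokenizer(prompt + label, return_tensors="pt").input_ids
        # Passing labels=ids makes the model return the mean token cross-entropy,
        # whose exp() is the sequence perplexity; argmin gives the prediction.
        losses.append(model(ids, labels=ids).loss.item())
    return labels[losses.index(min(losses))]


print(pick_label("Question: Is this text safe? Answer: ", ["safe", "unsafe"]))
```

Scoring only the label tokens (masking the prompt positions with -100) is a common refinement; this sketch scores the whole sequence for brevity.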
changelog.md
CHANGED
@@ -1,5 +1,6 @@
 # CHANGELOG
 
+
 ### 2024-7-16
 version: v1.0.0
 
@@ -66,4 +67,14 @@ version: v1.0.6
 - Deepseek-chat-v3-0324
 - Qwen3
 - Gemma-3
-- OpenThinker2
+- OpenThinker2
+
+### 2025-7-29
+version: v1.0.7
+
+changed:
+- [1]feat: Update the two models required by Deepexi.
+  - Deepexi-Guard-3B
+  - Qwen2.5-3B-Instruct
+
+- [2]feat: Update a new table ChineseGuardBench required by Deepxi.