Jay committed · b93bb99
Parent(s): 6fc8478

doc: update changelog

Files changed:
- app.py +3 -6
- assets/text.py +3 -4
- changelog.md +12 -1
app.py
CHANGED
@@ -15,7 +15,6 @@ ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -165,9 +164,6 @@ def get_ChineseGuardBench(
     main_choice: List[str],
 ):
     leaderboard_table = get_dataset_new_csv(model_size)
-    # elif main_choice != "Subclass":
-    #     subclass_choice = main_choice
-    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
 
@@ -216,12 +212,12 @@ with gr.Blocks() as demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
 
-        with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=
+        with gr.TabItem("π Generation", elem_id="od-benchmark-tab-table", id=5):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
 
-        with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=
+        with gr.TabItem("π Perplexity", elem_id="od-benchmark-tab-table", id=6):
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
@@ -292,6 +288,7 @@ with gr.Blocks() as demo:
     )
 
     # this is new results for ChineseGuardBench
+
     # main_choice.change(
     #     get_ChineseGuardBench,
     #     inputs=[model_choice, main_choice],
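The hunks above reduce get_ChineseGuardBench to a plain table lookup and leave the main_choice.change callback commented out. For orientation, here is a minimal sketch (not the repo's actual code) of how these pieces could wire together once that callback is re-enabled; the body of get_dataset_new_csv, the model_choice/main_choice widgets, and the "Size" column are assumptions, and only the identifiers and the CSV path come from the diff.

```python
# Minimal sketch, assuming hypothetical widgets and a hypothetical "Size" column.
from typing import List

import gradio as gr
import pandas as pd

ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding="utf-8")  # path from the diff


def get_dataset_new_csv(model_size: List[str]) -> pd.DataFrame:
    # Assumed filter: keep rows whose "Size" column matches the selected buckets.
    if not model_size:
        return ORIGINAL_DF_NEW
    return ORIGINAL_DF_NEW[ORIGINAL_DF_NEW["Size"].isin(model_size)]


def get_ChineseGuardBench(model_size: List[str], main_choice: List[str]) -> pd.DataFrame:
    # main_choice is accepted but unused, matching the trimmed function in the diff.
    return get_dataset_new_csv(model_size)


with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(["<7B", "7B-13B", ">13B"], label="Model size")  # assumed widget
    main_choice = gr.Radio(["Overall", "Subclass"], label="View")  # assumed widget

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Generation", elem_id="od-benchmark-tab-table", id=5):
            dataframe_all_gen = gr.components.Dataframe(elem_id="leaderboard-table")

    # Re-enabling the callback that the commit leaves commented out:
    main_choice.change(
        get_ChineseGuardBench,
        inputs=[model_choice, main_choice],
        outputs=dataframe_all_gen,
    )

demo.launch()
```

Keeping main_choice in the signature even though it is unused lets the commented-out callback be restored later without touching the function again.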
assets/text.py
CHANGED
@@ -34,14 +34,13 @@ EVALUTION_TEXT= """
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
 We evaluate the models using two methods: perplexity(multiple choice) and generation.
 For perplexity, we select the label which is the lowest perplexity as the predicted results.
-For generation, we use the content generated by the model to make prediction.
-
-
+For generation, we use the content generated by the model to make prediction.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its processing to be introduced later.
+The following are the results of the evaluation.πππ
 </span> <br><br>
 
 
 """ # noqa
-
 REFERENCE_TEXT = """
 # References
 <span style="font-size:16px; font-family: 'Times New Roman', serif">
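The EVALUTION_TEXT edited above describes the perplexity (multiple-choice) protocol: each candidate label is scored by the perplexity the model assigns it, and the lowest-perplexity label is taken as the prediction. Here is a minimal sketch of that scoring, assuming a Hugging Face causal LM; the model name, prompt, and safe/unsafe label set are placeholders, not the project's actual configuration.

```python
# Sketch: pick the label whose continuation gets the lowest LM loss (a monotone
# proxy for perplexity). Model name and labels are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.eval()


@torch.no_grad()
def pick_label(prompt: str, labels: list[str]) -> str:
    losses = []
    for label in labels:
        ids = tokenizer(prompt + label, return_tensors="pt").input_ids
        # Passing labels=ids makes the model return the mean token cross-entropy,
        # whose exp() is the sequence perplexity; argmin gives the prediction.
        losses.append(model(ids, labels=ids).loss.item())
    return labels[losses.index(min(losses))]


print(pick_label("Question: Is this text safe? Answer: ", ["safe", "unsafe"]))
```

Scoring only the label tokens (masking the prompt positions with -100) is a common refinement; this sketch scores the whole sequence for brevity.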
changelog.md
CHANGED
@@ -1,5 +1,6 @@
 # CHANGELOG
 
+
 ### 2024-7-16
 version: v1.0.0
 
@@ -66,4 +67,14 @@ version: v1.0.6
 - Deepseek-chat-v3-0324
 - Qwen3
 - Gemma-3
-- OpenThinker2
+- OpenThinker2
+
+### 2025-7-29
+version: v1.0.7
+
+changed:
+- [1]feat: Update the two models required by Deepexi.
+  - Deepexi-Guard-3B
+  - Qwen2.5-3B-Instruct
+
+- [2]feat: Update a new table ChineseGuardBench required by Deepxi.