File size: 7,789 Bytes
e353a82
 
 
 
 
 
 
 
 
 
3388e82
e353a82
c66f0bf
5d9b035
 
 
 
3388e82
5d9b035
 
3388e82
609fd32
 
3db71ca
e353a82
 
 
5d9b035
3388e82
f935a66
a51d3f7
f935a66
3388e82
a050217
f61bd87
a51d3f7
11c0693
a51d3f7
4cdb30d
a51d3f7
4cdb30d
0d78329
4cdb30d
f61bd87
f935a66
a51d3f7
 
 
 
86c5f36
 
 
 
 
 
f61bd87
6e69c2a
 
 
 
 
e353a82
 
 
f935a66
6e69c2a
 
 
 
208c50b
f935a66
0d78329
 
 
 
 
 
 
 
 
 
f935a66
e353a82
965f42a
e353a82
a51d3f7
 
 
 
 
 
 
 
e353a82
a51d3f7
 
e353a82
3388e82
 
e353a82
 
 
 
a7c29bf
c825a8f
a7c29bf
e353a82
3388e82
e353a82
 
 
 
 
 
3388e82
bb25558
f2f99bf
bb25558
3388e82
 
e353a82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324387b
 
 
3388e82
ddc4b67
 
 
 
 
1f5abbf
ddc4b67
f2f99bf
ddc4b67
324387b
ddc4b67
324387b
ddc4b67
324387b
 
ddc4b67
 
 
 
 
c66f0bf
ddc4b67
 
 
 
1f5abbf
ddc4b67
324387b
ddc4b67
1f5abbf
 
617d783
 
 
 
3388e82
1f5abbf
ddc4b67
 
 
 
 
 
 
1f5abbf
f2f99bf
ddc4b67
a7c29bf
 
 
5d9b035
67f06d3
a7c29bf
324387b
 
 
67f06d3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

# Hugging Face access token used to clone and push the submissions repo.
HF_TOKEN = os.environ.get("HF_TOKEN")


# Leaderboard column headers: model identity, then metrics for the
# Direct Prompting (DP) and Chain-of-Thought (CoT) evaluation settings.
MODEL_INFO = [
    "Models", "Model Size(B)", "Data Source",
    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
]


# Gradio Dataframe datatypes, aligned one-to-one with MODEL_INFO above.
DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
                   'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

# Hugging Face dataset repo that stores leaderboard submissions.
SUBMISSION_NAME = "Chumor-submissions"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/dnaihao", SUBMISSION_NAME)
# Results CSV inside the locally cloned submissions repo.
CSV_DIR = "./Chumor-submissions/result.csv"

COLUMN_NAMES = MODEL_INFO

# Markdown shown at the top of the leaderboard tab.
LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard

## Introduction
We construct Chumor, the first Chinese humor explanation dataset that exceeds the size of existing humor datasets. Chumor is sourced from Ruo Zhi Ba (弱智吧), a Chinese Reddit-like platform known for sharing intellectually challenging and culturally specific jokes.


## What's new about Chumor

Unlike existing datasets that focus on tasks such as humor detection, punchline identification, or humor generation, Chumor addresses the challenge of humor explanation. This involves not just identifying humor but understanding the reasoning behind it, a task that requires both linguistic and cultural knowledge. Specifically, Chumor tasks the LLMs with determining whether an explanation fully explains the joke. We source the explanations from GPT-4o and ERNIE-4-turbo, and have the entire dataset manually annotated by five native Chinese speakers.

For detailed information about the dataset, visit our page on Hugging Face:  https://huggingface.co/datasets/dnaihao/Chumor. 

If you are interested in replicating these results or wish to evaluate your models using our dataset, access our evaluation scripts available on GitHub: https://github.com/dnaihao/Chumor-dataset.

If you would like to learn more details about our dataset, please check out our paper: https://arxiv.org/pdf/2406.12754; https://arxiv.org/pdf/2412.17729.

Below you can find the accuracies of different models tested on this dataset.

### Acknowledgements

We construct the leaderboard based on the template from https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro.

"""

# Text shown directly above the results table (intentionally blank for now).
TABLE_INTRODUCTION = """
    """

# Dataset summary rendered in the leaderboard's information section.
LEADERBOARD_INFO = """
## Dataset Summary
- **Questions and Labels:** The task is to decide whether the provided explanation fully explains the joke (good) or does not fully explain the joke (bad).
- **Sources:**
  - **Jokes:** We construct our dataset by including RZB jokes from "Best Annual Threads" between 2018 and 2021 that have been previously crawled (https://github.com/Leymore/ruozhiba). In addition, we directly collect all threads in the "Moderator's Recommendation" section from RZB.
  - **Explanations:** We source the explanations from GPT-4o and ERNIE-4-turbo.
  - **Annotations:** We manually annotate the generated explanations as either "fully explain the joke" (good) or "partially explain or not explain the joke" (bad). The gold label is determined by the majority vote among five native Chinese speakers.
"""

# Label and BibTeX body for the "cite these results" copy button.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{he2024chumor,
  title={Chumor 1.0: A Truly Funny and Challenging Chinese Humor Understanding Dataset from Ruo Zhi Ba},
  author={He, Ruiqi and He, Yushu and Bai, Longju and Liu, Jiarui and Sun, Zhenjie and Tang, Zenghao and Wang, He and Xia, Hanchen and Deng, Naihao},
  journal={arXiv preprint arXiv:2406.12754},
  year={2024}
}

@misc{he2024chumor20benchmarkingchinese,
      title={Chumor 2.0: Towards Benchmarking Chinese Humor Understanding}, 
      author={Ruiqi He and Yushu He and Longju Bai and Jiarui Liu and Zhenjie Sun and Zenghao Tang and He Wang and Hanchen Xia and Rada Mihalcea and Naihao Deng},
      year={2024},
      eprint={2412.17729},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2412.17729}, 
}
"""

# Markdown shown on the submission tab. The original heading said
# "MMLU-Pro Leaderboard" — a leftover from the template this Space was
# built from (see the acknowledgement in LEADERBOARD_INTRODUCTION).
SUBMIT_INTRODUCTION = """# Submit on Chumor Leaderboard Introduction

## ⚠ Please note that you need to submit the CSV file with the following format:

```csv
labels
good
good
bad
...
```

You can generate an output file in the above format using the evaluation script provided in our GitHub repository. For your convenience, the script and detailed instructions are available at GitHub: https://github.com/dnaihao/Chumor-dataset. After generating the file, please send us an email at [email protected], attaching the output file.
"""


def get_df():
    """Clone/pull the submissions repo and return the leaderboard table.

    Returns:
        pd.DataFrame read from CSV_DIR, with 'Model Size(B)' normalized by
        process_model_size and rows sorted by 'DP Acc' in descending order.
    """
    # SUBMISSION_URL points at a dataset repo, so repo_type="dataset" is
    # required — the sibling add_new_eval() already passes it; without it,
    # Repository would treat the URL as a model repo and the clone can fail.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    df = df.sort_values(by=['DP Acc'], ascending=False)
    return df


def add_new_eval(
    input_file,
):
    """Validate an uploaded submission and append it to the shared CSV.

    Args:
        input_file: JSON text containing at least the keys "Model" and
            "DP Acc"; None when no file was uploaded.

    Returns:
        A status message string for display in the UI. (The original only
        returned a message on the empty-file path and merely printed the
        success/duplicate outcomes, so the UI showed None for those.)
    """
    if input_file is None:
        return "Error! Empty file!"

    upload_data = json.loads(input_file)
    print("upload_data:\n", upload_data)
    data_row = [f'{upload_data["Model"]}', upload_data['DP Acc']]
    print("data_row:\n", data_row)
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # The first CSV column holds the model name; collect existing names so
    # the same model cannot be submitted twice.
    already_submitted = []
    # newline='' is the documented way to open files for the csv module.
    with open(CSV_DIR, mode='r', newline='') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            if row:  # guard against blank lines, which would IndexError
                already_submitted.append(row[0])

    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(data_row)

        submission_repo.push_to_hub()
        print('Submission Successful')
        return 'Submission Successful'
    else:
        print('The entry already exists')
        return 'The entry already exists'

def refresh_data():
    """Re-pull the submissions repo and return the leaderboard columns."""
    return get_df()[COLUMN_NAMES]


def search_and_filter_models(df, query, min_size, max_size):
    """Filter the leaderboard by model-name substring and size range.

    Rows with a non-numeric size ('unknown', '-', or 'unk') are treated as
    the sentinel value 1000.0 when tested against [min_size, max_size].
    """
    result = df.copy()

    if query:
        name_hits = result['Models'].str.contains(query, case=False, na=False)
        result = result[name_hits]

    def in_range(size):
        # Unknown sizes stand in for the sentinel 1000.0.
        if size in ('unknown', '-', 'unk'):
            return min_size <= 1000.0 <= max_size
        return min_size <= size <= max_size

    result = result[result['Model Size(B)'].apply(in_range)]

    return result[COLUMN_NAMES]


# def search_and_filter_models(df, query, min_size, max_size):
#     filtered_df = df.copy()

#     if query:
#         filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    
#     def size_filter(x):
#         if isinstance(x, (int, float)):
#             return min_size <= x <= max_size
#         return True 
    
#     filtered_df = filtered_df[filtered_df['Model Size(B)'].apply(size_filter)]
    
#     return filtered_df[COLUMN_NAMES]


def search_models(df, query):
    """Return rows whose 'Models' value contains query, case-insensitively.

    An empty/falsy query returns the DataFrame unchanged.
    """
    if not query:
        return df
    mask = df['Models'].str.contains(query, case=False, na=False)
    return df[mask]


# def get_size_range(df):
#     numeric_sizes = df[df['Model Size(B)'].apply(lambda x: isinstance(x, (int, float)))]['Model Size(B)']
#     if len(numeric_sizes) > 0:
#         return float(numeric_sizes.min()), float(numeric_sizes.max())
#     return 0, 1000


def get_size_range(df):
    """Return (min, max) of 'Model Size(B)', mapping unknowns to 1000.0.

    Non-numeric markers ('unknown', '-', 'unk') are replaced with the
    sentinel 1000.0 so the slider range always has numeric bounds.
    """
    def as_number(size):
        return 1000.0 if size in ('unknown', '-', 'unk') else size

    numeric_sizes = df['Model Size(B)'].apply(as_number)
    return float(numeric_sizes.min()), float(numeric_sizes.max())


def process_model_size(size):
    """Normalize a raw size cell to a float, or 'unknown' when unparsable.

    NaN, 'unk', '-', and anything float() rejects all map to 'unknown'.
    """
    if pd.isna(size) or size in ('unk', '-'):
        return 'unknown'
    try:
        return float(size)
    except (ValueError, TypeError):
        return 'unknown'