import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository
HF_TOKEN = os.environ.get("HF_TOKEN")

# Leaderboard column headers; order must match the columns in result.csv.
MODEL_INFO = [
    "Models", "Model Size(B)", "Data Source",
    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
]
# Gradio Dataframe dtype for each MODEL_INFO column, in the same order.
DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
                   'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
SUBMISSION_NAME = "Chumor-submissions"
# Build the URL with plain string formatting: os.path.join uses the OS path
# separator (backslash on Windows) and must never be used for URLs.
SUBMISSION_URL = f"https://huggingface.co/datasets/dnaihao/{SUBMISSION_NAME}"
CSV_DIR = "./Chumor-submissions/result.csv"
COLUMN_NAMES = MODEL_INFO
LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard
## Introduction
We construct Chumor, the first Chinese humor explanation dataset that exceeds the size of existing humor datasets. Chumor is sourced from Ruo Zhi Ba (弱智吧), a Chinese Reddit-like platform known for sharing intellectually challenging and culturally specific jokes.
## What's new about Chumor
Unlike existing datasets that focus on tasks such as humor detection, punchline identification, or humor generation, Chumor addresses the challenge of humor explanation. This involves not just identifying humor but understanding the reasoning behind it, a task that requires both linguistic and cultural knowledge. Specifically, Chumor tasks the LLMs with determining whether an explanation fully explains the joke. We source the explanations from GPT-4o and ERNIE-4-turbo, and have the entire dataset manually annotated by five native Chinese speakers.
For detailed information about the dataset, visit our page on Hugging Face: https://huggingface.co/datasets/dnaihao/Chumor.
If you are interested in replicating these results or wish to evaluate your models using our dataset, access our evaluation scripts available on GitHub: https://github.com/dnaihao/Chumor-dataset.
If you would like to learn more details about our dataset, please check out our paper: https://arxiv.org/pdf/2406.12754; https://arxiv.org/pdf/2412.17729.
Below you can find the accuracies of different models tested on this dataset.
### Acknowledgements
We construct the leaderboard based on the template of https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro.
"""
TABLE_INTRODUCTION = """
"""
LEADERBOARD_INFO = """
## Dataset Summary
- **Questions and Labels:** The task is to decide whether the provided explanation fully explains the joke (good) or does not fully explain the joke (bad).
- **Sources:**
  - **Jokes:** We construct our dataset by including RZB jokes from "Best Annual Threads" between 2018 and 2021 that have been previously crawled (https://github.com/Leymore/ruozhiba). In addition, we directly collect all threads in the "Moderator's Recommendation" section from RZB.
  - **Explanations:** We source the explanations from GPT-4o and ERNIE-4-turbo.
  - **Annotations:** We manually annotate the generated explanations as either "fully explain the joke" (good) or "partially explain or not explain the joke" (bad). The gold label is determined by the majority vote among five native Chinese speakers.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{he2024chumor,
title={Chumor 1.0: A Truly Funny and Challenging Chinese Humor Understanding Dataset from Ruo Zhi Ba},
author={He, Ruiqi and He, Yushu and Bai, Longju and Liu, Jiarui and Sun, Zhenjie and Tang, Zenghao and Wang, He and Xia, Hanchen and Deng, Naihao},
journal={arXiv preprint arXiv:2406.12754},
year={2024}
}
@misc{he2024chumor20benchmarkingchinese,
title={Chumor 2.0: Towards Benchmarking Chinese Humor Understanding},
author={Ruiqi He and Yushu He and Longju Bai and Jiarui Liu and Zhenjie Sun and Zenghao Tang and He Wang and Hanchen Xia and Rada Mihalcea and Naihao Deng},
year={2024},
eprint={2412.17729},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2412.17729},
}
"""
SUBMIT_INTRODUCTION = """# Submit on Chumor Leaderboard Introduction
## ⚠ Please note that you need to submit the CSV file with the following format:
```csv
labels
good
good
bad
...
```
You can generate an output file in the above format using the evaluation script provided in our GitHub repository. For your convenience, the script and detailed instructions are available at GitHub: https://github.com/dnaihao/Chumor-dataset. After generating the file, please send us an email at [email protected], attaching the output file.
"""
def get_df():
    """Pull the latest results from the submissions repo and return them.

    Returns:
        pandas.DataFrame with normalized 'Model Size(B)' values, sorted by
        'DP Acc' descending.
    """
    # repo_type="dataset" keeps this consistent with add_new_eval — the
    # clone target is a dataset URL, not a model repo.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # Normalize raw size strings ('unk', '-', NaN) to float or 'unknown'.
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    df = df.sort_values(by=['DP Acc'], ascending=False)
    return df
def add_new_eval(
    input_file,
):
    """Validate an uploaded result, append it to the results CSV, and push.

    Args:
        input_file: JSON payload with at least "Model" and "DP Acc" keys,
            or None when nothing was uploaded.

    Returns:
        A short status string describing the outcome (every path now
        returns a string, so a gradio status output always gets text).
    """
    if input_file is None:
        return "Error! Empty file!"
    # NOTE(review): assumes gradio hands us the raw JSON content here, not a
    # temp-file path — confirm against the upload component's `type` setting.
    upload_data = json.loads(input_file)
    print("upload_data:\n", upload_data)
    data_row = [f'{upload_data["Model"]}', upload_data['DP Acc']]
    print("data_row:\n", data_row)
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # Collect model names already present so duplicates are never appended.
    already_submitted = []
    with open(CSV_DIR, mode='r') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            if row:  # skip blank lines instead of raising IndexError
                already_submitted.append(row[0])

    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            csv.writer(file).writerow(data_row)
        submission_repo.push_to_hub()
        print('Submission Successful')
        return 'Submission Successful'
    else:
        print('The entry already exists')
        return 'The entry already exists'
def refresh_data():
    """Re-pull the submissions repo and return the leaderboard columns."""
    return get_df()[COLUMN_NAMES]
def search_and_filter_models(df, query, min_size, max_size):
    """Filter the leaderboard by name substring and model-size range.

    Args:
        df: leaderboard DataFrame (see MODEL_INFO for columns).
        query: case-insensitive substring matched against 'Models';
            falsy query skips name filtering.
        min_size, max_size: inclusive size bounds in billions of params.

    Returns:
        The matching rows, restricted to COLUMN_NAMES.
    """
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[
            filtered_df['Models'].str.contains(query, case=False, na=False)]

    def _size_in_range(size):
        # Unknown sizes sit at 1000.0 on the slider (see get_size_range),
        # so they survive only when the selected range reaches 1000.0.
        if size in ('unknown', '-', 'unk'):
            return min_size <= 1000.0 <= max_size
        return min_size <= size <= max_size

    filtered_df = filtered_df[filtered_df['Model Size(B)'].apply(_size_in_range)]
    return filtered_df[COLUMN_NAMES]
def search_models(df, query):
    """Return rows whose 'Models' value contains *query*, case-insensitively.

    A falsy query returns the DataFrame unchanged; NaN names never match.
    """
    if not query:
        return df
    name_matches = df['Models'].str.contains(query, case=False, na=False)
    return df[name_matches]
def get_size_range(df):
    """Return (min, max) of the 'Model Size(B)' column as floats.

    Sentinel values ('unknown', '-', 'unk') count as 1000.0 so unknown-size
    models land at the top of the size-slider range.
    """
    sizes = df['Model Size(B)'].apply(
        lambda x: 1000.0 if x in ('unknown', '-', 'unk') else x)
    return float(sizes.min()), float(sizes.max())
def process_model_size(size):
    """Normalize a raw model-size cell to a float (billions) or 'unknown'.

    Missing values (NaN/None) and the placeholders 'unk' / '-' map to the
    sentinel string 'unknown'; anything float()-convertible becomes a float.
    """
    if pd.isna(size) or size in ('unk', '-'):
        return 'unknown'
    try:
        return float(size)
    except (ValueError, TypeError):
        return 'unknown'