import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

MODEL_INFO = [
    "Models", "Model Size(B)", "Data Source",
    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
]

# One display type per column in MODEL_INFO (as rendered by gr.Dataframe).
DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
                   'number', 'number', 'number', 'number',
                   'number', 'number', 'number', 'number']

SUBMISSION_NAME = "Chumor-submissions"
# Plain string concatenation: os.path.join is meant for filesystem paths, not URLs.
SUBMISSION_URL = "https://huggingface.co/datasets/dnaihao/" + SUBMISSION_NAME
# Path of the results CSV inside the locally cloned submissions repo.
CSV_DIR = "./Chumor-submissions/result.csv"

COLUMN_NAMES = MODEL_INFO

LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard

## Introduction

We construct Chumor, the first Chinese humor explanation dataset, which exceeds the size of existing humor datasets. Chumor is sourced from Ruo Zhi Ba (弱智吧), a Chinese Reddit-like platform known for sharing intellectually challenging and culturally specific jokes.

## What's new about Chumor

Unlike existing datasets that focus on tasks such as humor detection, punchline identification, or humor generation, Chumor addresses the challenge of humor explanation. This involves not just identifying humor but understanding the reasoning behind it, a task that requires both linguistic and cultural knowledge. Specifically, Chumor tasks LLMs with determining whether an explanation fully explains a joke. We source the explanations from GPT-4o and ERNIE-4-turbo, and have the entire dataset manually annotated by five native Chinese speakers.

For detailed information about the dataset, visit our page on Hugging Face: https://huggingface.co/datasets/dnaihao/Chumor.

If you are interested in replicating these results or wish to evaluate your models on our dataset, our evaluation scripts are available on GitHub: https://github.com/dnaihao/Chumor-dataset.

For more details about our dataset, please check out our papers: https://arxiv.org/pdf/2406.12754; https://arxiv.org/pdf/2412.17729.

Below you can find the accuracies of different models tested on this dataset.

### Acknowledgements

We build this leaderboard based on the template from https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro.
"""

TABLE_INTRODUCTION = """
"""

LEADERBOARD_INFO = """
## Dataset Summary
- **Questions and labels:** The task is to decide whether the provided explanation fully explains the joke ("good") or does not fully explain the joke ("bad").
- **Sources:**
  - **Jokes:** We construct our dataset from RZB jokes in the "Best Annual Threads" between 2018 and 2021 that have previously been crawled (https://github.com/Leymore/ruozhiba). In addition, we directly collect all threads in the "Moderator's Recommendation" section of RZB.
  - **Explanations:** We source the explanations from GPT-4o and ERNIE-4-turbo.
- **Annotations:** We manually annotate each generated explanation as either "fully explains the joke" (good) or "partially explains or does not explain the joke" (bad). The gold label is determined by majority vote among five native Chinese speakers.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{he2024chumor,
  title={Chumor 1.0: A Truly Funny and Challenging Chinese Humor Understanding Dataset from Ruo Zhi Ba},
  author={He, Ruiqi and He, Yushu and Bai, Longju and Liu, Jiarui and Sun, Zhenjie and Tang, Zenghao and Wang, He and Xia, Hanchen and Deng, Naihao},
  journal={arXiv preprint arXiv:2406.12754},
  year={2024}
}

@misc{he2024chumor20benchmarkingchinese,
  title={Chumor 2.0: Towards Benchmarking Chinese Humor Understanding},
  author={Ruiqi He and Yushu He and Longju Bai and Jiarui Liu and Zhenjie Sun and Zenghao Tang and He Wang and Hanchen Xia and Rada Mihalcea and Naihao Deng},
  year={2024},
  eprint={2412.17729},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2412.17729},
}
"""

SUBMIT_INTRODUCTION = """# Submit to the Chumor Leaderboard

## ⚠ Please note that you need to submit the CSV file in the following format:

```csv
labels
good
good
bad
...
```

You can generate an output file in the above format using the evaluation script in our GitHub repository, where detailed instructions are also available: https://github.com/dnaihao/Chumor-dataset. After generating the file, please send it to us at [email protected] as an attachment.
"""


def get_df():
    # Clone (or update) the submissions dataset repo, then load the results CSV.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    # Normalize sizes to floats (billions of parameters) or 'unknown'.
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    df = df.sort_values(by=['DP Acc'], ascending=False)
    return df


def add_new_eval(
    input_file,
):
    if input_file is None:
        return "Error! Empty file!"

    # `input_file` is expected to be the path of the uploaded JSON file
    # (e.g. from a gr.File component configured to pass a filepath).
    with open(input_file) as f:
        upload_data = json.load(f)
    print("upload_data:\n", upload_data)
    # Only the model name and its DP accuracy are recorded from the upload;
    # the remaining leaderboard columns are not filled in here.
    data_row = [upload_data["Model"], upload_data["DP Acc"]]
    print("data_row:\n", data_row)
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # Collect the model names that already appear in the results CSV.
    already_submitted = []
    with open(CSV_DIR, mode='r') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            already_submitted.append(row[0])

    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(data_row)

        submission_repo.push_to_hub()
        print('Submission Successful')
        return "Submission successful!"
    else:
        print('The entry already exists')
        return "Error! This entry already exists!"
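

# Example of the JSON payload `add_new_eval` expects; its shape is inferred
# from the keys read above ("Model", "DP Acc"), and the values are illustrative:
#
#     {"Model": "my-model-7B", "DP Acc": 0.631}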


def refresh_data():
    # Re-pull the submissions repo and return the freshly loaded leaderboard.
    df = get_df()
    return df[COLUMN_NAMES]


def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()

    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]

    # Rows with an unknown size stand in for the 1000.0 placeholder: they stay
    # visible only when the selected range covers that value.
    size_mask = filtered_df['Model Size(B)'].apply(
        lambda x: (min_size <= 1000.0 <= max_size) if x in ('unknown', '-', 'unk')
        else (min_size <= x <= max_size))

    filtered_df = filtered_df[size_mask]

    return filtered_df[COLUMN_NAMES]
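

# Illustrative usage (values are hypothetical), e.g. from a search-box callback:
#
#     df = get_df()
#     visible_rows = search_and_filter_models(df, "gpt", 0.0, 100.0)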


def search_models(df, query):
    # Keep rows whose model name contains the query (case-insensitive).
    if query:
        return df[df['Models'].str.contains(query, case=False, na=False)]
    return df


def get_size_range(df):
    # Map unknown sizes to the 1000.0 placeholder so min/max stay numeric.
    sizes = df['Model Size(B)'].apply(lambda x: 1000.0 if x in ('unknown', '-', 'unk') else x)
    return float(sizes.min()), float(sizes.max())


def process_model_size(size):
    # Normalize a raw size cell to a float (billions of parameters) or 'unknown'.
    if pd.isna(size) or size == 'unk' or size == "-":
        return 'unknown'
    try:
        return float(size)
    except (ValueError, TypeError):
        return 'unknown'
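

# Illustrative only: a minimal sketch of wiring the helpers above into a
# Gradio UI. The real Space layout may differ; component names here are
# hypothetical.
if __name__ == "__main__":
    df = get_df()

    with gr.Blocks() as demo:
        gr.Markdown(LEADERBOARD_INTRODUCTION)
        search_box = gr.Textbox(label="Search models")
        table = gr.Dataframe(value=df[COLUMN_NAMES], headers=COLUMN_NAMES,
                             datatype=DATA_TITLE_TYPE, interactive=False)
        refresh_button = gr.Button("Refresh")

        # Narrow the table as the query changes; Refresh re-pulls the repo.
        search_box.change(fn=lambda q: search_models(df, q)[COLUMN_NAMES],
                          inputs=search_box, outputs=table)
        refresh_button.click(fn=refresh_data, outputs=table)

    demo.launch()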