from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")


NUM_FEWSHOT = 0  # Change this to match your few-shot setting
# ---------------------------------------------------


# Your leaderboard name
TITLE = """

JavaBench Leaderboard

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """

A Benchmark of Object-Oriented Code Generation for Evaluating Large Language Models

""" # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = f""" ## How it works ## Reproducibility To reproduce our results, here is the commands you can run: """ EVALUATION_QUEUE_TEXT = """ Thank you for your interest in JavaBench. We warmly welcome researchers to submit additional benchmarking results, as we believe that collaborative efforts can significantly advance the study of Large Language Models and software engineering. For submission guidelines, please refer to our [Github Repo](https://github.com/java-bench/JavaBench?tab=readme-ov-file#usage). """ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r""" @misc{cao2024aibeatundergraduatesentrylevel, title={Can AI Beat Undergraduates in Entry-level Java Assignments? Benchmarking Large Language Models on JavaBench}, author={Jialun Cao and Zhiyong Chen and Jiarong Wu and Shing-chi Cheung and Chang Xu}, year={2024}, eprint={2406.12902}, archivePrefix={arXiv}, primaryClass={cs.LG}, url={https://arxiv.org/abs/2406.12902}, } """