from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("Count", "acc", "Count")
    task1 = Task("Order", "acc", "Order")
    task2 = Task("VCR", "acc", "VCR")
    task3 = Task("Culture", "acc", "Culture")
    task4 = Task("Trick", "acc", "Trick")


class N_Tasks(Enum):
    # The same tasks scored with F1 instead of accuracy (no F1 entry for VCR)
    task0_f1 = Task("Count", "f1", "Count")
    task1_f1 = Task("Order", "f1", "Order")
    task3_f1 = Task("Culture", "f1", "Culture")
    task4_f1 = Task("Trick", "f1", "Trick")


class Detail_Tasks(Enum):
    # Fine-grained accuracy for each task variant (0-4)
    task0_0 = Task("Count 0", "acc", "Count 0")
    task0_1 = Task("Count 1", "acc", "Count 1")
    task0_2 = Task("Count 2", "acc", "Count 2")
    task0_3 = Task("Count 3", "acc", "Count 3")
    task0_4 = Task("Count 4", "acc", "Count 4")
    task1_0 = Task("Order 0", "acc", "Order 0")
    task1_1 = Task("Order 1", "acc", "Order 1")
    task1_2 = Task("Order 2", "acc", "Order 2")
    task1_3 = Task("Order 3", "acc", "Order 3")
    task1_4 = Task("Order 4", "acc", "Order 4")
    task3_0 = Task("Culture 0", "acc", "Culture 0")
    task3_1 = Task("Culture 1", "acc", "Culture 1")
    task3_2 = Task("Culture 2", "acc", "Culture 2")
    task3_3 = Task("Culture 3", "acc", "Culture 3")
    task3_4 = Task("Culture 4", "acc", "Culture 4")
    task4_0 = Task("Trick 0", "acc", "Trick 0")
    task4_1 = Task("Trick 1", "acc", "Trick 1")
    task4_2 = Task("Trick 2", "acc", "Trick 2")
    task4_3 = Task("Trick 3", "acc", "Trick 3")
    task4_4 = Task("Trick 4", "acc", "Trick 4")


NUM_FEWSHOT = 0  # Change with your few-shot setting
# ---------------------------------------------------
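

# The helper below is an added, minimal sketch of how the Task fields are
# typically consumed: `benchmark` and `metric` index into a results dict,
# while `col_name` labels the leaderboard column. The assumed results layout,
# {benchmark: {metric: value}}, follows the comment in Tasks above; it is an
# assumption, not something this module defines.
def load_task_scores(results: dict, tasks=Tasks) -> dict:
    """Map each leaderboard column name to its score, skipping absent tasks."""
    scores = {}
    for task in tasks:
        benchmark_results = results.get(task.value.benchmark, {})
        if task.value.metric in benchmark_results:
            scores[task.value.col_name] = benchmark_results[task.value.metric]
    return scores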
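

# A second added sketch: writing a submission file in the format documented in
# EVALUATION_QUEUE_TEXT below. The ids and predictions here are placeholder
# data; only the JSON structure matters.
import json


def write_example_submission(path: str = "submission.json") -> None:
    """Write a tiny example submission file in the expected format."""
    predictions = [
        {"id": "0", "prediction": ["A"]},
        {"id": "1", "prediction": ["B"]},
        {"id": "2", "prediction": ["A", "C"]},  # multiple options may be predicted
    ]
    with open(path, "w") as f:
        json.dump(predictions, f, indent=2)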


# Your leaderboard name
TITLE = """
DARE leaderboard
"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
DARE (Diverse Visual Question Answering with Robustness Evaluation) evaluates vision-language models on the Count, Order, VCR, Culture, and Trick tasks.
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## DARE

The DARE dataset targets a variety of skills and evaluation settings; for details, see our [paper](https://arxiv.org/pdf/2409.18023).
Submit your predictions to evaluate your model.
"""

EVALUATION_QUEUE_TEXT = """
## Evaluation Queue

To add your model to the evaluation queue, please submit a file containing your model's predictions, with the question id and the predicted option(s) for each entry. The file should be in the following format:

```
[
    {"id": "0", "prediction": ["A"]},
    {"id": "1", "prediction": ["B"]},
    {"id": "2", "prediction": ["A", "C"]},
    ...
]
```

Make sure to include the following information in your submission:
- Model name
- Output format
- Task version
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@article{sterz2024dare,
    title={DARE: Diverse Visual Question Answering with Robustness Evaluation},
    author={Sterz, Hannah and Pfeiffer, Jonas and Vuli{\'c}, Ivan},
    journal={arXiv preprint arXiv:2409.18023},
    year={2024}
}
"""