from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    higher_is_better: bool = True
    scale_by_100: bool = True


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task key in the results json, metric key in the results json, display name in the leaderboard
    task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
    task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
    task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS")
    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo")
    task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
    task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
    task14 = Task("ironita_irony", "f1,none", "IronITA Irony")
    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm")
    task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
    task8 = Task("news_sum", "bertscore,none", "News Sum")
    task16 = Task("sentipolc", "f1,none", "SENTIPOLC")
    task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
    task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA")
    task11 = Task("xcopa_it", "acc,none", "XCOPA")

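# Illustration only (hypothetical helper, not part of the leaderboard code):
# a minimal sketch of how a Task's keys index a results file, assuming the
# standard lm-evaluation-harness output layout,
# {"results": {task_key: {metric_key: value}}}.
def _example_read_score(results_path: str, task: Task) -> float:
    import json

    with open(results_path) as f:
        data = json.load(f)
    raw = data["results"][task.benchmark][task.metric]
    # Most metrics are displayed as percentages; MCC (ItaCoLA) stays unscaled.
    return raw * 100 if task.scale_by_100 else raw


# e.g., _example_read_score("results.json", Tasks.task0.value) -> 50.0
# when acc_norm for arc_challenge_ita is 0.5.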

NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">ItaEval leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
This leaderboard evaluates language models on <b>ItaEval</b>, a new unified benchmark for Italian.

Some information:
- unlike other leaderboards you may find online, we do not support automatic evaluation of new model submissions
"""

ITA_EVAL_REPO = "https://github.com/g8a9/ita-eval"

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, head to {ITA_EVAL_REPO} for all the instructions.

If the setup goes smoothly, you can run a model (set `MODEL` below) on ItaEval with:
```bash
MODEL="..."
lm_eval --model hf \
    --model_args pretrained=$MODEL,dtype=bfloat16 \
    --tasks ita_eval \
    --device cuda:0 \
    --batch_size "auto" \
    --log_samples \
    --output_path $FAST/ita_eval_v1/$MODEL \
    --use_cache $FAST/ita_eval_v1/$MODEL \
    --cache_requests "true"
```
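
If you prefer to drive the harness from Python instead of the CLI, here is a minimal sketch (assuming the lm-evaluation-harness v0.4+ Python API; replace the elided model id with yours):

```python
import lm_eval

# Same arguments as the CLI invocation above.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=...,dtype=bfloat16",
    tasks=["ita_eval"],
    device="cuda:0",
    batch_size="auto",
)
```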
"""

EVALUATION_QUEUE_TEXT = """
We do not accept automatic submissions yet.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
We are working on it! :)
"""