TITLE = """<h1 align="center" id="space-title"> ๐Ÿ† CLEM Leaderboard</h1>"""

REPO = "https://raw.githubusercontent.com/clembench/clembench-runs/main/"
REGISTRY_URL = "https://raw.githubusercontent.com/clp-research/clemcore/refs/heads/main/clemcore/backends/model_registry.json"
BENCHMARK_FILE = "benchmark_runs.json"
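
# A minimal sketch of fetching the benchmark index from the raw-file URLs
# above. This is an illustrative assumption, not necessarily how the
# leaderboard app itself loads its data.
import requests


def fetch_benchmark_index() -> dict:
    """Download and parse benchmark_runs.json from the clembench-runs repo."""
    response = requests.get(REPO + BENCHMARK_FILE, timeout=30)
    response.raise_for_status()
    return response.json()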

HF_REPO = "colab-potsdam/clem-leaderboard"

TEXT_NAME = "🥇 CLEM Leaderboard"
MULTIMODAL_NAME = "🥇 Multimodal CLEM Leaderboard"

INTRODUCTION_TEXT = """
<h6 align="center">

The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models), with the suggested pronunciation “clems”.

The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).

The multimodal benchmark is described in [Using Game Play to Investigate Multimodal and Conversational Grounding in Large Multimodal Models](https://arxiv.org/abs/2406.14035).

Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench)

All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs) </h6>
"""

CLEMSCORE_TEXT = """
The <i>clemscore</i> combines a score representing the overall ability to follow the game instructions (scored separately in the field <i>Played</i>) with the quality of the play in those attempts where the instructions were followed (field <i>Quality Scores</i>). For details about the games / interaction settings, and for results on older versions of the benchmark, see the tab <i>Versions and Details</i>.
"""

SHORT_NAMES = {
    "t0.0": "",
    "claude-v1.3": "cl-1.3",
    "claude-2": "cl-2",
    "claude-2.1": "cl-2.1",
    "claude-instant-1.2": "cl-ins-1.2",
    "gpt-3.5-turbo-0613": "3.5-0613",
    "gpt-3.5-turbo-1106": "3.5-1106",
    "gpt-4-0613": "4-0613",
    "gpt-4-1106-preview": "4-1106",
    "gpt-4-0314": "4-0314",
    "gpt-4": "4",
    "text-davinci-003": "3",
    "luminous-supreme": "lm",
    "koala-13b": "k-13b",
    "falcon-40b": "fal-40b",
    "falcon-7b-instruct": "fal-7b",
    "falcon-40b-instruct": "flc-i-40b",
    "oasst-12b": "oas-12b",
    "oasst-sft-4-pythia-12b-epoch-3.5": "ost-12b",
    "vicuna-13b": "vic-13b",
    "vicuna-33b-v1.3": "vic-33b-v1.3",
    "sheep-duck-llama-2-70b-v1.1": "sd-l2-70b-v1.1",
    "sheep-duck-llama-2-13b": "sd-l2-13b",
    "WizardLM-70b-v1.0": "w-70b-v1.0",
    "CodeLlama-34b-Instruct-hf": "cl-34b",
    "command": "com",
    "Mistral-7B-Instruct-v0.1": "m-i-7b-v0.1",
    "Wizard-Vicuna-13B-Uncensored-HF": "vcn-13b",
    "llama-2-13b-chat-hf": "l2-13b",
    "llama-2-70b-chat-hf": "l2-70b",
    "llama-2-7b-chat-hf": "l2-7b",
    "koala-13B-HF": "k-13b",
    "WizardLM-13b-v1.2": "w-13b-v1.2",
    "vicuna-7b-v1.5": "vic-7b-v1.5",
    "vicuna-13b-v1.5": "vic-13b-v1.5",
    "gpt4all-13b-snoozy": "g4a-13b-s",
    "zephyr-7b-alpha": "z-7b-a",
    "zephyr-7b-beta": "z-7b-b"
}
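

# A minimal sketch (an assumption about the intended use of SHORT_NAMES):
# replace every known long model name or suffix in a label with its short
# form, so that e.g. "gpt-4-0613-t0.0" becomes "4-0613".
def shorten_model_name(name: str) -> str:
    """Abbreviate a model label using the SHORT_NAMES mapping."""
    for long_name, short_name in SHORT_NAMES.items():
        name = name.replace(long_name, short_name)
    return name.strip("-")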