TITLE = """<h1 align="center" id="space-title"> ๐ CLEM Leaderboard</h1>"""
REPO = "https://raw.githubusercontent.com/clembench/clembench-runs/main/"
REGISTRY_URL = "https://raw.githubusercontent.com/clp-research/clemcore/refs/heads/main/clemcore/backends/model_registry.json"
BENCHMARK_FILE = "benchmark_runs.json"
HF_REPO = "colab-potsdam/clem-leaderboard"
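# A minimal usage sketch (not part of the original leaderboard code): the constants
# above point at raw JSON files in the clembench-runs repository. Assuming the
# `requests` library is available, the run index could be fetched like this; the
# Space's actual loading logic may differ.
import requests


def fetch_benchmark_runs() -> dict:
    """Download and parse the index of benchmark runs from clembench-runs."""
    response = requests.get(REPO + BENCHMARK_FILE, timeout=30)
    response.raise_for_status()
    return response.json()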
TEXT_NAME = "🥇 CLEM Leaderboard"
MULTIMODAL_NAME = "🥇 Multimodal CLEM Leaderboard"
INTRODUCTION_TEXT = """
<h6 align="center">
The CLEM Leaderboard aims to track, rank and evaluate current cLLMs (chat-optimized Large Language Models), with the suggested pronunciation “clems”.
The benchmarking approach is described in [Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as Conversational Agents](https://aclanthology.org/2023.emnlp-main.689.pdf).
The multimodal benchmark is described in [Using Game Play to Investigate Multimodal and Conversational Grounding in Large Multimodal Models](https://arxiv.org/abs/2406.14035).
Source code for benchmarking "clems" is available here: [Clembench](https://github.com/clembench/clembench).
All generated files and results from the benchmark runs are available here: [clembench-runs](https://github.com/clembench/clembench-runs). </h6>
"""
CLEMSCORE_TEXT = """
The <i>clemscore</i> combines a score representing the overall ability to follow the game instructions (separately reported in the field <i>Played</i>) with the quality of play in the attempts where instructions were followed (field <i>Quality Scores</i>). For details about the games / interaction settings, and for results on older versions of the benchmark, see the tab <i>Versions and Details</i>.
"""
SHORT_NAMES = {
"t0.0": "",
"claude-v1.3": "cl-1.3",
"claude-2": "cl-2",
"claude-2.1": "cl-2.1",
"claude-instant-1.2": "cl-ins-1.2",
"gpt-3.5-turbo-0613": "3.5-0613",
"gpt-3.5-turbo-1106": "3.5-1106",
"gpt-4-0613": "4-0613",
"gpt-4-1106-preview": "4-1106",
"gpt-4-0314": "4-0314",
"gpt-4": "4",
"text-davinci-003": "3",
"luminous-supreme": "lm",
"koala-13b": "k-13b",
"falcon-40b": "fal-40b",
"falcon-7b-instruct": "fal-7b",
"falcon-40b-instruct": "flc-i-40b",
"oasst-12b": "oas-12b",
"oasst-sft-4-pythia-12b-epoch-3.5": "ost-12b",
"vicuna-13b": "vic-13b",
"vicuna-33b-v1.3": "vic-33b-v1.3",
"sheep-duck-llama-2-70b-v1.1": "sd-l2-70b-v1.1",
"sheep-duck-llama-2-13b": "sd-l2-13b",
"WizardLM-70b-v1.0": "w-70b-v1.0",
"CodeLlama-34b-Instruct-hf": "cl-34b",
"command": "com",
"Mistral-7B-Instruct-v0.1": "m-i-7b-v0.1",
"Wizard-Vicuna-13B-Uncensored-HF": "vcn-13b",
"llama-2-13b-chat-hf": "l2-13b",
"llama-2-70b-chat-hf": "l2-70b",
"llama-2-7b-chat-hf": "l2-7b",
"koala-13B-HF": "k-13b",
"WizardLM-13b-v1.2": "w-13b-v1.2",
"vicuna-7b-v1.5": "vic-7b-v1.5",
"vicuna-13b-v1.5": "vic-13b-v1.5",
"gpt4all-13b-snoozy": "g4a-13b-s",
"zephyr-7b-alpha": "z-7b-a",
"zephyr-7b-beta": "z-7b-b"
}
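# Illustrative helper (an assumption, not part of the original file): one way the
# SHORT_NAMES mapping could be applied to compact a model label for plotting.
# Longest keys are replaced first so that e.g. "gpt-4-0613" is not clobbered by the
# shorter "gpt-4" entry; the "t0.0" entry strips the temperature suffix entirely.
def shorten_model_name(label: str) -> str:
    """Return `label` with every known long model name replaced by its short form."""
    for long_name in sorted(SHORT_NAMES, key=len, reverse=True):
        label = label.replace(long_name, SHORT_NAMES[long_name])
    return label.strip("-")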