from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    # One leaderboard metric: the benchmark it belongs to, the metric key used in
    # the result files, and the column name displayed on the leaderboard.
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # Retrieval metrics
    mrr = Task("retrieval", "mrr", "MRR ⬆️")
    map = Task("retrieval", "map", "MAP ⬆️")

    # Rule-based generation metrics
    em = Task("generation", "em", "EM ⬆️")
    f1 = Task("generation", "f1", "F1 ⬆️")
    rouge1 = Task("generation", "rouge1", "Rouge-1 ⬆️")
    rouge2 = Task("generation", "rouge2", "Rouge-2 ⬆️")
    rougeL = Task("generation", "rougeL", "Rouge-L ⬆️")

    # Model-based generation metrics
    accuracy = Task("generation", "accuracy", "ACC ⬆️")
    completeness = Task("generation", "completeness", "COMP ⬆️")
    hallucination = Task("generation", "hallucination", "HAL ⬆️")
    utilization = Task("generation", "utilization", "UTIL ⬆️")
    numerical_accuracy = Task("generation", "numerical_accuracy", "MACC ⬆️")
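
# Usage sketch (illustrative only; the leaderboard app may assemble its columns
# differently): the display columns can be derived directly from the enum, grouped
# by the benchmark each metric belongs to.
RETRIEVAL_COLS = [t.value.col_name for t in Tasks if t.value.benchmark == "retrieval"]
GENERATION_COLS = [t.value.col_name for t in Tasks if t.value.benchmark == "generation"]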


NUM_FEWSHOT = 0  # number of few-shot examples used for evaluation


TITLE = """<h1 align="center" id="space-title">π
OmniEval Leaderboard</h1>""" |

INTRODUCTION_TEXT = """
<div align="center">OmniEval: Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain</div>
"""

LLM_BENCHMARKS_TEXT = f"""
# <div align="center">OmniEval: Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain</div>

<div align="center">
<!-- <a href="https://arxiv.org/abs/2405.13576" target="_blank"><img src=https://img.shields.io/badge/arXiv-b5212f.svg?logo=arxiv></a> -->
<!-- <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Datasets-27b3b4.svg></a> -->
<!-- <a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Checkpoint-5fc372.svg></a> -->
<!-- <a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace%20Checkpoint-b181d9.svg></a> -->
<a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a>
<a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-5fc372></a>
<a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-b181d9></a>
<a href="https://huggingface.co/spaces/NLPIR-RAG/OmniEval" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Leaderboard-blue></a>
<a href="https://github.com/RUC-NLPIR/FlashRAG/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/badge/LICENSE-MIT-green"></a>
<a><img alt="Static Badge" src="https://img.shields.io/badge/made_with-Python-blue"></a>
</div>

<h4 align="center"> |
|
|
|
<p> |
|
<a href="#wrench-installation">Installation</a> | |
|
<!-- <a href="#sparkles-features">Features</a> | --> |
|
<a href="#rocket-quick-start">Quick-Start</a> | |
|
<a href="#bookmark-license">License</a> | |
|
<a href="#star2-citation">Citation</a> |
|
|
|
</p> |
|
|
|
</h4> |
|
|
|
## 🔧 Installation
`conda env create -f environment.yml && conda activate finrag`

<!-- ## ✨ Features
1. -->

## 🚀 Quick-Start
Notes:
1. Run all commands below from the `./OpenFinBench` directory.
2. We provide our auto-generated evaluation dataset on <a href="https://huggingface.co/datasets/RUC-NLPIR/FlashRAG_datasets/" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-27b3b4></a> (see the download sketch below).

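A minimal download sketch (the local path is an assumption; the Hub repo hosts several datasets, so you may pass `allow_patterns` to fetch only the OmniEval files):
```python
# Hypothetical snippet: fetch the evaluation data from the Hugging Face Hub.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="RUC-NLPIR/FlashRAG_datasets",
    repo_type="dataset",
    local_dir="./omnieval_data",  # illustrative destination
)
```
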
### 1. Build the Retrieval Corpus
```
# cd OpenFinBench
sh corpus_builder/build_corpus.sh  # See the comments inside the bash file to set the parameters.
```

### 2. Generate Evaluation Data Samples
1. Generate evaluation instances
```
# cd OpenFinBench
sh data_generator/generate_data.sh
```
2. Filter the generated instances (quality inspection)
```
sh data_generator/generate_data_filter.sh
```

### 3. Run Inference with Your Models
```
# cd OpenFinBench
sh evaluator/inference/rag_inference.sh
```

### 4. Evaluate Your Models
#### (a) Rule-based Evaluation
```
# cd OpenFinBench
sh evaluator/judgement/judger.sh # by setting judge_type="rule"
```

#### (b) Model-based Evaluation
We propose five model-based metrics: accuracy, completeness, utilization, numerical accuracy, and hallucination. To implement them, we trained two evaluators from Qwen2.5-7B using the LoRA strategy and human-annotated labels.

Note that the hallucination evaluator is separate from the one used for the other four metrics. The model checkpoints can be loaded from the following Hugging Face links:
1. The evaluator for the hallucination metric: <a href="https://huggingface.co/ShootingWong/OmniEval-HallucinationEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-b181d9></a>
2. The evaluator for the other four metrics: <a href="https://huggingface.co/ShootingWong/OmniEval-ModelEvaluator" target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Checkpoint-5fc372></a>

To implement model-based evaluation, first set up two vLLM servers, one for each evaluator checkpoint. A minimal sketch (the ports and launch options are illustrative; adjust them to your environment):
```
# Illustrative commands: serve each evaluator with vLLM's OpenAI-compatible server.
python -m vllm.entrypoints.openai.api_server --model ShootingWong/OmniEval-ModelEvaluator --port 8000 &
python -m vllm.entrypoints.openai.api_server --model ShootingWong/OmniEval-HallucinationEvaluator --port 8001 &
```

Then run the model-based evaluation with the following command (change the parameters inside the bash file):
```
sh evaluator/judgement/judger.sh
```

## 🔖 License
OmniEval is licensed under the [<u>MIT License</u>](./LICENSE).

## 🌟 Citation
The paper has not been released yet; the citation will be added once it is available.
"""


EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a newer format for storing weights that is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
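
One way to do the conversion, as a rough sketch using the standard `save_pretrained` API (the model name and output path are placeholders):
```python
# Hypothetical example: re-save an existing checkpoint with safetensors serialization.
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("your model name")
tokenizer = AutoTokenizer.from_pretrained("your model name")
model.save_pretrained("./model-safetensors", safe_serialization=True)
tokenizer.save_pretrained("./model-safetensors")
```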

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill out your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check that you can run the EleutherAI LM Evaluation Harness on your model locally (you can add `--limit` to limit the number of examples per task).
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""
|