|
from dataclasses import dataclass |
|
from enum import Enum |
|
|
|
@dataclass
class Task:
    """Description of a single benchmark task shown on the leaderboard.

    All four fields are plain strings and are required, in this order.
    See the ``Tasks`` enum for the concrete values used by this project.
    """

    # Task identifier, e.g. "aiera_ect_sum".
    benchmark: str
    # Metric identifier, e.g. "accuracy,none".
    # NOTE(review): presumably the key looked up in the evaluation results;
    # confirm against the code that consumes it.
    metric: str
    # Short human-readable label, e.g. "Summary".
    col_name: str
    # Link to the dataset backing the task.
    reference_url: str
|
|
|
|
|
|
|
|
|
class Tasks(Enum):
    """Enumeration of the benchmark tasks tracked by the leaderboard.

    Each member's value is a ``Task`` holding the task identifier, the
    metric identifier, the display label and the dataset URL.
    """

    # Sentiment tagging of transcript segments.
    task0 = Task(
        benchmark="aiera_transcript_sentiment",
        metric="accuracy,none",
        col_name="Sentiment",
        reference_url="https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment",
    )
    # Earnings call transcript summarization.
    task1 = Task(
        benchmark="aiera_ect_sum",
        metric="bert_f1,none",
        col_name="Summary",
        reference_url="https://huggingface.co/datasets/Aiera/aiera-ect-sum",
    )
    # Question answering over financial text.
    task2 = Task(
        benchmark="finqa",
        metric="exact_match_manual,none",
        col_name="Q&A",
        reference_url="https://huggingface.co/datasets/Aiera/finqa-verified",
    )
    # Speaker assignment for transcript segments.
    task3 = Task(
        benchmark="aiera_speaker_assign",
        metric="accuracy,none",
        col_name="Speaker ID",
        reference_url="https://huggingface.co/datasets/Aiera/aiera-speaker-assign",
    )
|
|
|
|
|
# Few-shot setting for the evaluations.
# NOTE(review): presumably 0 means zero-shot; confirm against the consumer.
NUM_FEWSHOT = 0

# Path to the leaderboard title image asset.
LEADERBOARD_TITLE_PNG = "assets/aiera-leaderboard-transparent.png"

# HTML heading used as the page title.
TITLE = '<h1 align="center" id="space-title">Aiera Leaderboard</h1>'
|
|
|
|
|
# Markdown introduction describing what the leaderboard evaluates.
# The spelling of "available" was corrected in this user-facing text.
INTRODUCTION_TEXT = """
The Aiera Leaderboard evaluates the performance of LLMs on a number of financial intelligence tasks including:
* Assignments of speakers for event transcript segments and identification of speaker changes.
* Abstractive summarizations of earnings call transcripts.
* Calculation-based Q&A over financial text.
* Financial sentiment tagging for transcript segments.

A guide for eval tasks is available on github at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
"""
|
|
|
|
|
# Markdown describing the evaluated tasks and how to reproduce the results.
# This is a plain string: there are no placeholders, so no f-prefix is needed.
# Task names use the same underscore identifiers as the ``Tasks`` enum; the
# dataset URLs keep their hyphenated form.
LLM_BENCHMARKS_TEXT = """
## How it works
Models are evaluated on the following tasks
* **aiera_speaker_assign**: Assignments of speakers for event transcript segments and identification of speaker changes. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-speaker-assign).
* **aiera_ect_sum**: Abstractive summarizations of earnings call transcripts. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-ect-sum).
* **finqa**: Calculation-based Q&A over financial text. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/finqa-verified).
* **aiera_transcript_sentiment**: Event transcript segments with labels indicating the financial sentiment. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment).

## Reproducibility
A guide for running the above tasks using EleutherAI's lm-evaluation-harness is available on github at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).

"""
|
|
|
# Markdown shown alongside the evaluation queue: model availability note and
# guidance for failed runs. Spelling of "available" and "accommodate" was
# corrected in this user-facing text.
EVALUATION_QUEUE_TEXT = """
Note: The evaluation suite is only able to run on models available via Hugging Face's Serverless Inference API. Unfortunately, that means the models available for execution are limited, but we are working to support more models in the future.

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Check you can launch the EleutherAIHarness on your model locally using the guide available on github at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
Models must be able to accommodate large context windows in order to run this evaluation.

"""
|
|
|
# Label for the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entries for the leaderboard and the evaluation harness.
# Raw string so that "\url" is kept literally rather than read as an escape.
# BibTeX separates multiple authors with " and "; a comma would make the two
# names parse as a single "Last, First" author.
CITATION_BUTTON_TEXT = r"""@misc{aiera-finance-leaderboard,
author = {Jacqueline Garrahan and Bryan Healey},
title = {Aiera Finance Leaderboard},
year = {2024},
publisher = {Aiera},
howpublished = "\url{https://huggingface.co/spaces/Aiera/aiera-finance-leaderboard}"
}
@software{eval-harness,
author = {Gao, Leo and
Tow, Jonathan and
Biderman, Stella and
Black, Sid and
DiPofi, Anthony and
Foster, Charles and
Golding, Laurence and
Hsu, Jeffrey and
McDonell, Kyle and
Muennighoff, Niklas and
Phang, Jason and
Reynolds, Laria and
Tang, Eric and
Thite, Anish and
Wang, Ben and
Wang, Kevin and
Zou, Andy},
title = {A framework for few-shot language model evaluation},
month = sep,
year = 2021,
publisher = {Zenodo},
version = {v0.0.1},
doi = {10.5281/zenodo.5371628},
url = {https://doi.org/10.5281/zenodo.5371628}
}
"""
|
|