Jacqueline Garrahan
committed
Commit 57dbb7f
1 Parent(s): 0ccfdb5
check in about
Browse files
- src/about.py +53 -10
src/about.py
CHANGED
@@ -13,10 +13,11 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
-
-
+    task0 = Task("aiera_transcript_sentiment", "accuracy,none", "Sentiment", reference_url="https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment")
+    task1 = Task("aiera_ect_sum", "rougeLsum,none", "Summary", reference_url="https://huggingface.co/datasets/Aiera/aiera-ect-sum")
+    task2 = Task("finqa", "exact_match_manual,none", "Q&A", reference_url="https://huggingface.co/datasets/Aiera/finqa-verified")
+    task3 = Task("aiera_speaker_assign", "accuracy,none", "Speaker ID", reference_url="https://huggingface.co/datasets/Aiera/aiera-speaker-assign")
+
     #task7 = Task("flare_ner", "accuracy,none", "flare-ner", reference_url="test")
 
 
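For context when reading this hunk: the `Task` constructor called above is defined earlier in src/about.py and does not appear in the diff. A minimal sketch of a compatible container, assuming the stock Hugging Face leaderboard template extended with a `reference_url` field (a hypothetical reconstruction, not the actual definition):

```python
from dataclasses import dataclass

@dataclass
class Task:
    # task_key in the results JSON (e.g. "aiera_ect_sum")
    benchmark: str
    # metric_key in the results JSON (e.g. "rougeLsum,none")
    metric: str
    # name to display in the leaderboard column
    col_name: str
    # link to the dataset card; assumed extension over the template
    reference_url: str = ""
```

Under this assumption, each enum member carries its config as `Tasks.task0.value.benchmark`, `Tasks.task0.value.metric`, and so on, which is how the template's results loader reads them.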
@@ -29,17 +30,26 @@ TITLE = """<h1 align="center" id="space-title">Aiera Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-The Aiera
+The Aiera Leaderboard evaluates the performance of LLMs on a number of financial intelligence tasks, including:
+* Assignment of speakers to event transcript segments and identification of speaker changes.
+* Abstractive summarization of earnings call transcripts.
+* Calculation-based Q&A over financial text.
+* Financial sentiment tagging of transcript segments.
+
+A guide to the eval tasks is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-
-
+Models are evaluated on the following tasks:
+* **aiera_speaker_assign**: Assignment of speakers to event transcript segments and identification of speaker changes. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-speaker-assign).
+* **aiera_ect_sum**: Abstractive summarization of earnings call transcripts. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-ect-sum).
+* **finqa**: Calculation-based Q&A over financial text. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/finqa-verified).
+* **aiera_transcript_sentiment**: Event transcript segments with labels indicating the financial sentiment. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment).
 
 ## Reproducibility
-A guide for running
+A guide to running the above tasks with EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
 
 """
 
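The Reproducibility text points at the aiera-benchmark-tasks repo but the diff itself contains no runnable command. A minimal sketch of one way to drive these tasks through lm-evaluation-harness's Python API (v0.4.x assumed), where the clone path, model name, and `limit` are placeholder assumptions rather than values from this commit:

```python
# pip install lm-eval   (EleutherAI lm-evaluation-harness)
import lm_eval
from lm_eval.tasks import TaskManager

# Register the external Aiera task definitions; the path assumes you cloned
# https://github.com/aiera-inc/aiera-benchmark-tasks into the working directory.
task_manager = TaskManager(include_path="./aiera-benchmark-tasks")

results = lm_eval.simple_evaluate(
    model="hf",                    # Hugging Face transformers backend
    model_args="pretrained=gpt2",  # placeholder; substitute your model
    tasks=["aiera_speaker_assign", "aiera_ect_sum",
           "finqa", "aiera_transcript_sentiment"],
    task_manager=task_manager,
    limit=10,                      # small smoke test, like the CLI's --limit
)
print(results["results"])
```

The same run is available from the CLI entry point with equivalent flags; the repo linked above is the authoritative guide.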
@@ -70,9 +80,42 @@ When we add extra information about models to the leaderboard, it will be automa
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task). A guide to running the Aiera tasks with EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
+
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
+CITATION_BUTTON_TEXT = r"""@misc{aiera-finance-leaderboard,
+  author       = {Jacqueline Garrahan and Bryan Healey},
+  title        = {Aiera Finance Leaderboard},
+  year         = {2024},
+  publisher    = {Aiera},
+  howpublished = "\url{https://huggingface.co/spaces/Aiera/aiera-finance-leaderboard}"
+}
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
 """
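`CITATION_BUTTON_LABEL` and `CITATION_BUTTON_TEXT` are only defined in this file; in the leaderboard template they are usually surfaced in the Gradio app as a copyable accordion. A sketch of that wiring, assuming the stock template's app.py layout (hypothetical; the actual app code is outside this diff):

```python
import gradio as gr

from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

with gr.Blocks() as demo:
    # Collapsed accordion at the bottom of the leaderboard page.
    with gr.Accordion("Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=20,
            elem_id="citation-button",
            show_copy_button=True,  # the copy affordance the label refers to
        )

demo.launch()
```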
|