Jacqueline Garrahan committed
Commit 57dbb7f
1 Parent(s): 0ccfdb5

check in about

Files changed (1)
  1. src/about.py +53 -10
src/about.py CHANGED
@@ -13,10 +13,11 @@ class Task:
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
- task0 = Task("aiera_speaker_assign", "accuracy,none", "Speaker ID", reference_url="https://huggingface.co/datasets/Aiera/aiera-speaker-assign")
- task1 = Task("aiera_transcript_sentiment", "accuracy,none", "Sentiment", reference_url="https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment")
- task4 = Task("aiera_ect_sum", "rougeLsum,none", "aiera_ect_sum", reference_url="https://huggingface.co/datasets/Aiera/aiera-ect-sum")
- task5 = Task("finqa", "exact_match_manual,none", "finqa", reference_url="https://huggingface.co/datasets/Aiera/finqa-verified")
+ task0 = Task("aiera_transcript_sentiment", "accuracy,none", "Sentiment", reference_url="https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment")
+ task1 = Task("aiera_ect_sum", "rougeLsum,none", "Summary", reference_url="https://huggingface.co/datasets/Aiera/aiera-ect-sum")
+ task2 = Task("finqa", "exact_match_manual,none", "Q&A", reference_url="https://huggingface.co/datasets/Aiera/finqa-verified")
+ task3 = Task("aiera_speaker_assign", "accuracy,none", "Speaker ID", reference_url="https://huggingface.co/datasets/Aiera/aiera-speaker-assign")
+
#task7 = Task("flare_ner", "accuracy,none", "flare-ner", reference_url="test")


@@ -29,17 +30,26 @@ TITLE = """<h1 align="center" id="space-title">Aiera Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
- The Aiera Financial Leaderboard evaluates the performance of LLMs on ...
+ The Aiera Leaderboard evaluates the performance of LLMs on a number of financial intelligence tasks, including:
+ * Assignment of speakers to event transcript segments and identification of speaker changes.
+ * Abstractive summarization of earnings call transcripts.
+ * Calculation-based Q&A over financial text.
+ * Financial sentiment tagging for transcript segments.
+
+ A guide to the eval tasks is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works
-
- Proof something happened
+ Models are evaluated on the following tasks:
+ * **aiera_speaker_assign**: Assignment of speakers to event transcript segments and identification of speaker changes. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-speaker-assign).
+ * **aiera_ect_sum**: Abstractive summarization of earnings call transcripts. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-ect-sum).
+ * **finqa**: Calculation-based Q&A over financial text. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/finqa-verified).
+ * **aiera_transcript_sentiment**: Event transcript segments with labels indicating financial sentiment. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment).

## Reproducibility
- A guide for running Aiera's tasks using EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
+ A guide for running the above tasks using EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).

"""

@@ -70,9 +80,42 @@ When we add extra information about models to the leaderboard, it will be automa
## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
- If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+ If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task). A guide for running Aiera's tasks using EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
+
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
- CITATION_BUTTON_TEXT = r"""
+ CITATION_BUTTON_TEXT = r"""@misc{aiera-finance-leaderboard,
+   author       = {Jacqueline Garrahan and Bryan Healey},
+   title        = {Aiera Finance Leaderboard},
+   year         = {2024},
+   publisher    = {Aiera},
+   howpublished = "\url{https://huggingface.co/spaces/Aiera/aiera-finance-leaderboard}"
+ }
+ @software{eval-harness,
+   author    = {Gao, Leo and
+                Tow, Jonathan and
+                Biderman, Stella and
+                Black, Sid and
+                DiPofi, Anthony and
+                Foster, Charles and
+                Golding, Laurence and
+                Hsu, Jeffrey and
+                McDonell, Kyle and
+                Muennighoff, Niklas and
+                Phang, Jason and
+                Reynolds, Laria and
+                Tang, Eric and
+                Thite, Anish and
+                Wang, Ben and
+                Wang, Kevin and
+                Zou, Andy},
+   title     = {A framework for few-shot language model evaluation},
+   month     = sep,
+   year      = 2021,
+   publisher = {Zenodo},
+   version   = {v0.0.1},
+   doi       = {10.5281/zenodo.5371628},
+   url       = {https://doi.org/10.5281/zenodo.5371628}
+ }
"""
 