Jacqueline Garrahan
committed
Commit 57dbb7f
1 Parent(s): 0ccfdb5
check in about
Browse files
- src/about.py +53 -10
src/about.py
CHANGED
@@ -13,10 +13,11 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
-
-
+    task0 = Task("aiera_transcript_sentiment", "accuracy,none", "Sentiment", reference_url="https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment")
+    task1 = Task("aiera_ect_sum", "rougeLsum,none", "Summary", reference_url="https://huggingface.co/datasets/Aiera/aiera-ect-sum")
+    task2 = Task("finqa", "exact_match_manual,none", "Q&A", reference_url="https://huggingface.co/datasets/Aiera/finqa-verified")
+    task3 = Task("aiera_speaker_assign", "accuracy,none", "Speaker ID", reference_url="https://huggingface.co/datasets/Aiera/aiera-speaker-assign")
+
     #task7 = Task("flare_ner", "accuracy,none", "flare-ner", reference_url="test")
 
 
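For context when reading this hunk: the `Task` constructor called above is defined earlier in src/about.py and does not appear in the diff. A minimal sketch of a compatible container, assuming the stock Hugging Face leaderboard template extended with a `reference_url` field (a hypothetical reconstruction, not the actual definition):

```python
from dataclasses import dataclass

@dataclass
class Task:
    # task_key in the results JSON (e.g. "aiera_ect_sum")
    benchmark: str
    # metric_key in the results JSON (e.g. "rougeLsum,none")
    metric: str
    # name to display in the leaderboard column
    col_name: str
    # link to the dataset card; assumed extension over the template
    reference_url: str = ""
```

Under this assumption, each enum member carries its config as `Tasks.task0.value.benchmark`, `Tasks.task0.value.metric`, and so on, which is how the template's results loader reads them.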
@@ -29,17 +30,26 @@ TITLE = """<h1 align="center" id="space-title">Aiera Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-The Aiera
+The Aiera Leaderboard evaluates the performance of LLMs on a number of financial intelligence tasks, including:
+* Assignment of speakers to event transcript segments and identification of speaker changes.
+* Abstractive summarization of earnings call transcripts.
+* Calculation-based Q&A over financial text.
+* Financial sentiment tagging of transcript segments.
+
+A guide to the eval tasks is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-
-
+Models are evaluated on the following tasks:
+* **aiera_speaker_assign**: Assignment of speakers to event transcript segments and identification of speaker changes. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-speaker-assign).
+* **aiera_ect_sum**: Abstractive summarization of earnings call transcripts. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-ect-sum).
+* **finqa**: Calculation-based Q&A over financial text. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/finqa-verified).
+* **aiera_transcript_sentiment**: Event transcript segments with labels indicating the financial sentiment. Dataset available on [huggingface](https://huggingface.co/datasets/Aiera/aiera-transcript-sentiment).
 
 ## Reproducibility
-A guide for running
+A guide to running the above tasks with EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
 
 """
 
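The Reproducibility text points at the aiera-benchmark-tasks repo but the diff itself contains no runnable command. A minimal sketch of one way to drive these tasks through lm-evaluation-harness's Python API (v0.4.x assumed), where the clone path, model name, and `limit` are placeholder assumptions rather than values from this commit:

```python
# pip install lm-eval   (EleutherAI lm-evaluation-harness)
import lm_eval
from lm_eval.tasks import TaskManager

# Register the external Aiera task definitions; the path assumes you cloned
# https://github.com/aiera-inc/aiera-benchmark-tasks into the working directory.
task_manager = TaskManager(include_path="./aiera-benchmark-tasks")

results = lm_eval.simple_evaluate(
    model="hf",                    # Hugging Face transformers backend
    model_args="pretrained=gpt2",  # placeholder; substitute your model
    tasks=["aiera_speaker_assign", "aiera_ect_sum",
           "finqa", "aiera_transcript_sentiment"],
    task_manager=task_manager,
    limit=10,                      # small smoke test, like the CLI's --limit
)
print(results["results"])
```

The same run is available from the CLI entry point with equivalent flags; the repo linked above is the authoritative guide.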
@@ -70,9 +80,42 @@ When we add extra information about models to the leaderboard, it will be automa
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task). A guide to running the Aiera tasks with EleutherAI's lm-evaluation-harness is available on GitHub at [https://github.com/aiera-inc/aiera-benchmark-tasks](https://github.com/aiera-inc/aiera-benchmark-tasks).
+
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
+CITATION_BUTTON_TEXT = r"""@misc{aiera-finance-leaderboard,
+  author       = {Jacqueline Garrahan and Bryan Healey},
+  title        = {Aiera Finance Leaderboard},
+  year         = {2024},
+  publisher    = {Aiera},
+  howpublished = "\url{https://huggingface.co/spaces/Aiera/aiera-finance-leaderboard}"
+}
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
 """
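`CITATION_BUTTON_LABEL` and `CITATION_BUTTON_TEXT` are only defined in this file; in the leaderboard template they are usually surfaced in the Gradio app as a copyable accordion. A sketch of that wiring, assuming the stock template's app.py layout (hypothetical; the actual app code is outside this diff):

```python
import gradio as gr

from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

with gr.Blocks() as demo:
    # Collapsed accordion at the bottom of the leaderboard page.
    with gr.Accordion("Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=20,
            elem_id="citation-button",
            show_copy_button=True,  # the copy affordance the label refers to
        )

demo.launch()
```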
|