Colin Lin committed
Commit
8b37d0c
2 Parent(s): 02a46a8 32a84b3

Merge pull request #4 from miragecoa/main

Files changed (3):
  1. src/about.py +2 -2
  2. src/leaderboard/read_evals.py +9 -2
  3. src/populate.py +20 -2
src/about.py CHANGED
@@ -52,7 +52,7 @@ class Tasks(Enum):
     task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
     task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
     task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
-    task56 = Task("FinTrade", "CR", "FinTrade", category="Decision-Making (DM)")
+    task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -140,7 +140,7 @@ Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUG
 - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
 - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
 - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
-- **FinTrade**: CR, SR, DV, AV, MD. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
+- **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
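
Note: this change switches the FinTrade column's headline metric from Cumulative Return (CR) to Sharpe Ratio (SR). For reference, here is a minimal sketch of how CR and SR are conventionally computed from a daily-return series. It is illustrative only, not the benchmark's evaluation code; the 252-day annualization and zero risk-free rate are assumptions.

```python
import numpy as np

def cumulative_return(daily_returns: np.ndarray) -> float:
    """Cumulative Return (CR): compounded growth over the whole period."""
    return float(np.prod(1.0 + daily_returns) - 1.0)

def sharpe_ratio(daily_returns: np.ndarray, risk_free_daily: float = 0.0) -> float:
    """Sharpe Ratio (SR): annualized mean excess return over its volatility.
    Assumes 252 trading days per year and a zero risk-free rate by default."""
    excess = daily_returns - risk_free_daily
    return float(np.sqrt(252) * excess.mean() / excess.std(ddof=1))

# Example on a year of simulated daily returns
rng = np.random.default_rng(0)
returns = rng.normal(loc=0.0005, scale=0.01, size=252)
print(cumulative_return(returns), sharpe_ratio(returns))
```
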
src/leaderboard/read_evals.py CHANGED
@@ -37,6 +37,7 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)
 
         config = data.get("config")
@@ -149,11 +150,17 @@ class EvalResult:
             elif task.value.category == "Text Generation (TG)":
                 category_averages["average_TG"].append(score)
             elif task.value.category == "Risk Management (RM)":
-                category_averages["average_RM"].append((score + 100) / 2)
+                if score == "missing":
+                    category_averages["average_RM"].append(score)
+                else:
+                    category_averages["average_RM"].append((score + 100) / 2)
             elif task.value.category == "Forecasting (FO)":
                 category_averages["average_FO"].append(score)
             elif task.value.category == "Decision-Making (DM)":
-                category_averages["average_DM"].append(score)
+                if task.value.benchmark == "FinTrade" and score != "missing":
+                    category_averages["average_DM"].append((score + 3) / 6)
+                else:
+                    category_averages["average_DM"].append(score)
             elif task.value.category == "Spanish":
                 category_averages["average_Spanish"].append(score)
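
Note: both branches rescale raw scores while leaving the "missing" sentinel untouched. RM scores are MCC-style values in [-100, 100], so (score + 100) / 2 maps them onto [0, 100]; for FinTrade, (score + 3) / 6 maps a Sharpe Ratio onto [0, 1], which implies an assumed SR range of [-3, 3] (an inference from the formula, not stated in the diff). A sketch of the same logic pulled into a helper (the function name is ours, not the repo's):

```python
def normalize_score(score, category, benchmark=None):
    """Rescale a raw score as in read_evals.py; "missing" is a string
    sentinel and must bypass all arithmetic."""
    if score == "missing":
        return score
    if category == "Risk Management (RM)":
        # MCC-style score in [-100, 100] -> [0, 100]
        return (score + 100) / 2
    if category == "Decision-Making (DM)" and benchmark == "FinTrade":
        # Sharpe Ratio, presumably in [-3, 3] -> [0, 1]
        return (score + 3) / 6
    return score

assert normalize_score("missing", "Risk Management (RM)") == "missing"
assert normalize_score(0, "Risk Management (RM)") == 50.0
assert normalize_score(3, "Decision-Making (DM)", benchmark="FinTrade") == 1.0
```
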
src/populate.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 import pandas as pd
+import numpy as np
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
@@ -35,13 +36,30 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
     for task in mcc_tasks:
         if task in df.columns:
-            df[task] = (df[task] + 100) / 2.0
+            df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
+
+    for index, row in df.iterrows():
+        if "FinTrade" in row and row["FinTrade"] != "missing":
+            df.loc[index, "FinTrade"] = (row["FinTrade"] + 3) / 6
 
     # Now, select the columns that were passed to the function
-    df = df[cols].round(decimals=2)
+    df = df[cols]
+
+    # Function to round numeric values, including those in string format
+    def round_numeric(x):
+        try:
+            return round(float(x), 1)
+        except ValueError:
+            return x
+
+    # Apply rounding to all columns except 'T' and 'Model'
+    for col in df.columns:
+        if col not in ['T', 'Model']:
+            df[col] = df[col].apply(round_numeric)
 
     # Filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+
     return raw_data, df
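
Note: once "missing" strings can appear in score columns, those columns become object-typed and the old df[cols].round(decimals=2) silently skips them, so numeric values mixed in alongside the sentinel would never get rounded. round_numeric works per-cell instead: anything float() can parse (including numbers stored as strings) is rounded to one decimal, and everything else passes through unchanged. A small usage sketch on hypothetical data:

```python
import pandas as pd

def round_numeric(x):
    """Round anything float() can parse to one decimal; pass the rest through."""
    try:
        return round(float(x), 1)
    except ValueError:
        return x

# Hypothetical mini-frame: scores arrive as floats, numeric strings,
# or the "missing" sentinel, so columns are object-typed.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "German": [73.26, "61.04"],
    "FinTrade": [0.4183, "missing"],
})
for col in df.columns:
    if col not in ["T", "Model"]:
        df[col] = df[col].apply(round_numeric)
print(df)
#      Model  German FinTrade
# 0  model-a    73.3      0.4
# 1  model-b    61.0  missing
```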