Colin Lin committed
Commit
8b37d0c
2 Parent(s): 02a46a8 32a84b3

Merge pull request #4 from miragecoa/main

Files changed (3):
  1. src/about.py +2 -2
  2. src/leaderboard/read_evals.py +9 -2
  3. src/populate.py +20 -2
src/about.py CHANGED
@@ -52,7 +52,7 @@ class Tasks(Enum):
     task53 = Task("EFPA", "F1", "EFPA", category="Spanish")
     task54 = Task("FinanceES", "F1", "FinanceES", category="Spanish")
     task55 = Task("TSA-Spanish", "F1", "TSA-Spanish", category="Spanish")
-    task56 = Task("FinTrade", "CR", "FinTrade", category="Decision-Making (DM)")
+    task56 = Task("FinTrade", "SR", "FinTrade", category="Decision-Making (DM)")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -140,7 +140,7 @@ Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUG
 - **EFPA**: F1. Financial argument classification in Spanish. This dataset requires the classification of arguments in Spanish financial documents, focusing on identifying claims, evidence, and other argumentative structures.
 - **FinanceES**: F1. Financial sentiment classification in Spanish. The task involves classifying sentiment in a broad range of Spanish financial documents, including news articles and reports. It tests the model's ability to adapt sentiment analysis techniques to a non-English language.
 - **TSA-Spanish**: F1. Sentiment analysis in Spanish. This dataset involves sentiment analysis on Spanish-language tweets and short texts, similar to the English TSA dataset but tailored for Spanish speakers. It evaluates the model's ability to process and analyze sentiment in social media content.
-- **FinTrade**: CR, SR, DV, AV, MD. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
+- **FinTrade**: SR. Stock trading dataset. FinTrade is a novel dataset developed specifically for evaluating stock trading tasks using LLMs. It incorporates historical stock prices, financial news, and sentiment data from 10 different stocks over a year. This dataset is designed to simulate real-world trading scenarios, allowing models to perform agent-based financial trading. The task evaluates the models on multiple financial metrics such as Cumulative Return (CR), Sharpe Ratio (SR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). These metrics provide a comprehensive assessment of the model's profitability, risk management, and decision-making capabilities.
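
Note: this change switches the FinTrade column's headline metric from Cumulative Return (CR) to Sharpe Ratio (SR). For reference, here is a minimal sketch of how CR and SR are conventionally computed from a daily-return series. It is illustrative only, not the benchmark's evaluation code; the 252-day annualization and zero risk-free rate are assumptions.

```python
import numpy as np

def cumulative_return(daily_returns: np.ndarray) -> float:
    """Cumulative Return (CR): compounded growth over the whole period."""
    return float(np.prod(1.0 + daily_returns) - 1.0)

def sharpe_ratio(daily_returns: np.ndarray, risk_free_daily: float = 0.0) -> float:
    """Sharpe Ratio (SR): annualized mean excess return over its volatility.
    Assumes 252 trading days per year and a zero risk-free rate by default."""
    excess = daily_returns - risk_free_daily
    return float(np.sqrt(252) * excess.mean() / excess.std(ddof=1))

# Example on a year of simulated daily returns
rng = np.random.default_rng(0)
returns = rng.normal(loc=0.0005, scale=0.01, size=252)
print(cumulative_return(returns), sharpe_ratio(returns))
```
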
src/leaderboard/read_evals.py CHANGED
@@ -37,6 +37,7 @@ class EvalResult:
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
+            print(json_filepath)
             data = json.load(fp)
 
         config = data.get("config")
@@ -149,11 +150,17 @@ class EvalResult:
             elif task.value.category == "Text Generation (TG)":
                 category_averages["average_TG"].append(score)
             elif task.value.category == "Risk Management (RM)":
-                category_averages["average_RM"].append((score + 100) / 2)
+                if score == "missing":
+                    category_averages["average_RM"].append(score)
+                else:
+                    category_averages["average_RM"].append((score + 100) / 2)
             elif task.value.category == "Forecasting (FO)":
                 category_averages["average_FO"].append(score)
             elif task.value.category == "Decision-Making (DM)":
-                category_averages["average_DM"].append(score)
+                if task.value.benchmark == "FinTrade" and score != "missing":
+                    category_averages["average_DM"].append((score + 3) / 6)
+                else:
+                    category_averages["average_DM"].append(score)
             elif task.value.category == "Spanish":
                 category_averages["average_Spanish"].append(score)
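
Note: both branches rescale raw scores while leaving the "missing" sentinel untouched. RM scores are MCC-style values in [-100, 100], so (score + 100) / 2 maps them onto [0, 100]; for FinTrade, (score + 3) / 6 maps a Sharpe Ratio onto [0, 1], which implies an assumed SR range of [-3, 3] (an inference from the formula, not stated in the diff). A sketch of the same logic pulled into a helper (the function name is ours, not the repo's):

```python
def normalize_score(score, category, benchmark=None):
    """Rescale a raw score as in read_evals.py; "missing" is a string
    sentinel and must bypass all arithmetic."""
    if score == "missing":
        return score
    if category == "Risk Management (RM)":
        # MCC-style score in [-100, 100] -> [0, 100]
        return (score + 100) / 2
    if category == "Decision-Making (DM)" and benchmark == "FinTrade":
        # Sharpe Ratio, presumably in [-3, 3] -> [0, 1]
        return (score + 3) / 6
    return score

assert normalize_score("missing", "Risk Management (RM)") == "missing"
assert normalize_score(0, "Risk Management (RM)") == 50.0
assert normalize_score(3, "Decision-Making (DM)", benchmark="FinTrade") == 1.0
```
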
src/populate.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 import pandas as pd
+import numpy as np
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
@@ -35,13 +36,30 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     mcc_tasks = ["German", "Australian", "LendingClub", "ccf", "ccfraud", "polish", "taiwan", "portoseguro", "travelinsurance"]
     for task in mcc_tasks:
         if task in df.columns:
-            df[task] = (df[task] + 100) / 2.0
+            df[task] = df.apply(lambda row: (row[task] + 100) / 2.0 if row[task] != "missing" else row[task], axis=1)
+
+    for index, row in df.iterrows():
+        if "FinTrade" in row and row["FinTrade"] != "missing":
+            df.loc[index, "FinTrade"] = (row["FinTrade"] + 3) / 6
 
     # Now, select the columns that were passed to the function
-    df = df[cols].round(decimals=2)
+    df = df[cols]
+
+    # Function to round numeric values, including those in string format
+    def round_numeric(x):
+        try:
+            return round(float(x), 1)
+        except ValueError:
+            return x
+
+    # Apply rounding to all columns except 'T' and 'Model'
+    for col in df.columns:
+        if col not in ['T', 'Model']:
+            df[col] = df[col].apply(round_numeric)
 
     # Filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+
     return raw_data, df
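
Note: once "missing" strings can appear in score columns, those columns become object-typed and the old df[cols].round(decimals=2) silently skips them, so numeric values mixed in alongside the sentinel would never get rounded. round_numeric works per-cell instead: anything float() can parse (including numbers stored as strings) is rounded to one decimal, and everything else passes through unchanged. A small usage sketch on hypothetical data:

```python
import pandas as pd

def round_numeric(x):
    """Round anything float() can parse to one decimal; pass the rest through."""
    try:
        return round(float(x), 1)
    except ValueError:
        return x

# Hypothetical mini-frame: scores arrive as floats, numeric strings,
# or the "missing" sentinel, so columns are object-typed.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "German": [73.26, "61.04"],
    "FinTrade": [0.4183, "missing"],
})
for col in df.columns:
    if col not in ["T", "Model"]:
        df[col] = df[col].apply(round_numeric)
print(df)
#      Model  German FinTrade
# 0  model-a    73.3      0.4
# 1  model-b    61.0  missing
```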