yixuantt committed
Commit 81bcfbe · verified · 1 Parent(s): 141271c

Upload 3 files

Files changed (3)
  1. app.py +229 -0
  2. benchmark.xlsx +0 -0
  3. task_metadata.py +94 -0
app.py ADDED
@@ -0,0 +1,229 @@
import pandas as pd
import gradio as gr
from collections import defaultdict


def parse_excel(file_path):
    """Read the benchmark workbook into nested dicts.

    Each sheet is named '<task>_<lang>' with lang in {'en', 'zh'}; the first
    column holds model names, an optional 'URL' column holds model links, and
    the remaining columns hold per-dataset scores.
    """
    xls = pd.ExcelFile(file_path)

    task_data = defaultdict(lambda: defaultdict(dict))
    all_models = set()
    all_datasets = defaultdict(set)
    model_urls = {}  # store model URLs

    for sheet_name in xls.sheet_names:
        if '_' not in sheet_name:
            continue

        task_name, lang = sheet_name.rsplit('_', 1)
        if lang not in ['en', 'zh']:
            continue

        df = xls.parse(sheet_name)

        has_url = 'URL' in df.columns
        urls = df['URL'].tolist() if has_url else [None] * len(df)

        models = df.iloc[:, 0].tolist()
        datasets = [col for col in df.columns[1:] if col != 'URL'] if has_url else df.columns[1:].tolist()

        for model, url in zip(models, urls):
            if url and pd.notnull(url):
                model_urls[model] = url

        all_models.update(models)
        all_datasets[task_name].update([(d, lang) for d in datasets])

        for idx, row in df.iterrows():
            model = row.iloc[0]
            scores = row[datasets].tolist() if datasets else []
            task_data[task_name][lang][model] = dict(zip(datasets, scores))

    return task_data, sorted(all_models), dict(all_datasets), model_urls


def calculate_averages(task_data, all_models):
    """Return per-language overall averages and per-task, per-language averages."""
    lang_overall_avg = defaultdict(lambda: defaultdict(list))
    task_lang_avg = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for task, langs in task_data.items():
        for lang, models in langs.items():
            for model in all_models:
                if model in models:
                    scores = list(models[model].values())
                    lang_overall_avg[lang][model].extend(scores)
                    task_lang_avg[task][lang][model].extend(scores)

    overall = {
        lang: {
            model: sum(scores)/len(scores) if scores else 0.0
            for model, scores in models.items()
        }
        for lang, models in lang_overall_avg.items()
    }

    processed_task_avg = defaultdict(dict)
    for task, langs in task_lang_avg.items():
        for lang, models in langs.items():
            processed_task_avg[task][lang] = {
                model: sum(scores)/len(scores) if scores else 0.0
                for model, scores in models.items()
            }

    return overall, processed_task_avg


def filter_models(search_term):
    """Return the models whose names contain the search term (case-insensitive)."""
    if not search_term:
        return all_models
    return [m for m in all_models if search_term.lower() in m.lower()]


def create_lang_view(lang, models):
    """Build the Overview table for one language: overall average plus one column per task."""
    model_links = [
        f'<a href="{model_urls.get(m, "#")}" target="_blank">{m}</a>'
        if model_urls.get(m) else m
        for m in models
    ]

    df_data = {
        "Model": model_links,
        f"Overall ({lang.upper()})": [
            round(overall_avg[lang].get(m, 0), 3)
            for m in models
        ]
    }

    for task in sorted(task_avg.keys()):
        task_scores = []
        for m in models:
            score = task_avg[task].get(lang, {}).get(m, 0)
            task_scores.append(round(score, 3))
        df_data[task] = task_scores

    df = pd.DataFrame(df_data)

    if not df.empty:
        # Drop models with no scores in this language, then sort by the overall average.
        numeric_cols = df.columns[df.columns != "Model"]
        df = df[~(df[numeric_cols] == 0).all(axis=1)]
        df = df.sort_values(by=f"Overall ({lang.upper()})", ascending=False)
        df.reset_index(drop=True, inplace=True)

    return df if not df.empty else pd.DataFrame({"Status": [f"No {lang.upper()} data matching criteria..."]})


def create_overall_view(search_term=None):
    """Build the English and Chinese Overview tables."""
    filtered_models = filter_models(search_term)

    en_df = create_lang_view('en', filtered_models)
    zh_df = create_lang_view('zh', filtered_models)

    return en_df, zh_df


def create_task_view(task_name, search_term=None):
    """Build the English and Chinese per-dataset tables for a single task."""
    task_langs = task_data.get(task_name, {})
    dfs = []

    filtered_models = filter_models(search_term)

    model_links = [
        f'<a href="{model_urls.get(m, "#")}" target="_blank">{m}</a>'
        if model_urls.get(m) else m
        for m in filtered_models
    ]

    for lang in ['en', 'zh']:
        lang_data = task_langs.get(lang, {})
        datasets = []

        if lang_data:
            models_in_lang = list(lang_data.keys())
            if models_in_lang:
                datasets = sorted(lang_data[models_in_lang[0]].keys())

        df = pd.DataFrame(columns=["Model", "Avg."] + datasets)

        for i, model in enumerate(filtered_models):
            row_data = {"Model": model_links[i]}
            scores = []
            if model in lang_data:
                for ds in datasets:
                    score = lang_data[model].get(ds, 0.0)
                    row_data[ds] = round(score, 3)
                    scores.append(score)
                row_data["Avg."] = round(sum(scores)/len(scores) if scores else 0.0, 3)
            else:
                row_data.update({ds: 0.0 for ds in datasets})
                row_data["Avg."] = 0.0
            df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)

        if datasets:
            df = df[["Model", "Avg."] + datasets]
            numeric_cols = df.columns[df.columns != "Model"]
            if not numeric_cols.empty:
                df = df[~(df[numeric_cols] == 0).all(axis=1)]
                df = df.sort_values(by="Avg.", ascending=False)
                df.reset_index(drop=True, inplace=True)
        else:
            df = pd.DataFrame({"Status": ["There is no data for this language."]})

        dfs.append(df)

    return dfs


task_data, all_models, all_datasets, model_urls = parse_excel('benchmark.xlsx')
overall_avg, task_avg = calculate_averages(task_data, all_models)

with gr.Blocks(title="Benchmark Leaderboard", css=""".search-box {margin-bottom: 20px}
.gradio-container {max-width: 100% !important}
.dataframe {width: 100% !important}""") as demo:
    gr.Markdown("# 💰 FinMTEB Benchmark Leaderboard")
    gr.Markdown("**Finance** Massive Text Embedding Benchmark (FinMTEB): an embedding benchmark consisting of 64 financial domain-specific text datasets across English and Chinese, spanning seven different tasks.")
    gr.Markdown("---")
    gr.Markdown("📖 If you find our work helpful, please cite the following paper: [Do We Need Domain-Specific Embedding Models? An Empirical Investigation](https://arxiv.org/pdf/2409.18511v1)")
    gr.Markdown("GitHub: [FinMTEB](https://github.com/yixuantt/FinMTEB/blob/main/README.md)")
    search = gr.Textbox(
        placeholder="🔍 Enter the model name...",
        label="model_search",
        show_label=False,
        elem_classes=["search-box"]
    )

    with gr.Tabs() as main_tabs:
        with gr.Tab("📊 Overview"):
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### English Datasets")
                en_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
            with gr.Column(elem_classes=["lang-section"]):
                gr.Markdown("### Chinese Datasets")
                zh_table = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])

            search.change(
                create_overall_view,
                inputs=search,
                outputs=[en_table, zh_table]
            )
            demo.load(
                lambda: create_overall_view(),
                outputs=[en_table, zh_table]
            )

        # One tab per task found in the workbook.
        for task_name in task_data:
            with gr.Tab(task_name):
                with gr.Column():
                    gr.Markdown("### English Datasets")
                    en_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])
                with gr.Column():
                    gr.Markdown("### Chinese Datasets")
                    zh_display = gr.DataFrame(interactive=False, datatype=["markdown", "markdown", "html"])

                search.change(
                    lambda term, tn=task_name: create_task_view(tn, term),
                    inputs=search,
                    outputs=[en_display, zh_display]
                )
                demo.load(
                    lambda tn=task_name: create_task_view(tn),
                    outputs=[en_display, zh_display]
                )

        with gr.Tab("📬 Submit"):
            gr.Markdown("---")
            gr.Markdown("To report results, please send them to **[email protected]**")
            gr.Markdown("😊 Thanks for your contribution!")

if __name__ == "__main__":
    demo.launch()
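To run this leaderboard locally (a minimal setup sketch, not documented in this commit): install gradio, pandas, and openpyxl (pandas needs openpyxl to read .xlsx files), keep benchmark.xlsx next to app.py as uploaded here, and run python app.py; Gradio serves the demo on http://localhost:7860 by default.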
benchmark.xlsx ADDED
Binary file (44.3 kB)
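parse_excel in app.py implies the layout this workbook is expected to have: one sheet per task and language, named <Task>_<en|zh>, with model names in the first column, an optional URL column linking each model, and one score column per dataset. A minimal sketch of a matching sheet, with hypothetical model names and illustrative scores (the sheet and column values below are examples only, not the actual benchmark contents):

import pandas as pd

sheet = pd.DataFrame({
    "Model": ["model-a", "model-b"],                  # first column: model names (hypothetical)
    "URL": ["https://huggingface.co/model-a", None],  # optional link column
    "FiQA2018Retrieval": [0.512, 0.498],              # one score column per dataset (illustrative values)
    "FinanceBenchRetrieval": [0.433, 0.441],
})
with pd.ExcelWriter("benchmark_example.xlsx") as writer:
    sheet.to_excel(writer, sheet_name="Retrieval_en", index=False)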
 
task_metadata.py ADDED
@@ -0,0 +1,94 @@
TASK_LIST_STS = {
    "en": [
        "FINAL",
        "FinSTS",
    ],
    "zh": [
        "AFQMC",
        "BQCorpus",
    ],
}

TASK_LIST_CLASSIFICATION = {
    "en": [
        "FinancialPhraseBankClassification",
        "FinSentClassification",
        "FiQAClassification",
        "SemEva2017Classification",
        "FLSClassification",
        "ESGClassification",
        "FOMCClassification",
        "FinancialFraudClassification",
    ],
    "zh": [
        "FinNSPClassification",
        "FinChinaSentimentClassification",
        "FinFEClassification",
        "OpenFinDataSentimentClassification",
        "Weibo21Classification",
    ],
}

TASK_LIST_RETRIEVAL = {
    "en": [
        "FiQA2018Retrieval",
        "FinanceBenchRetrieval",
        "HC3Retrieval",
        "Apple10KRetrieval",
        "FinQARetrieval",
        "TATQARetrieval",
        "USNewsRetrieval",
        "TradeTheEventEncyclopediaRetrieval",
        "TradeTheEventNewsRetrieval",
        "TheGoldmanEnRetrieval",
    ],
    "zh": [
        "FinTruthQARetrieval",
        "FinEvaRetrieval",
        "AlphaFinRetrieval",
        "DISCFinLLMRetrieval",
        "DISCFinLLMComputingRetrieval",
        "DuEEFinRetrieval",
        "SmoothNLPRetrieval",
        "THUCNewsRetrieval",
        "FinEvaEncyclopediaRetrieval",
        "TheGoldmanZhRetrieval",
    ],
}

TASK_LIST_CLUSTERING = {
    "en": [
        "MInDS14EnClustering",
        "ComplaintsClustering",
        "PiiClustering",
        "FinanceArxivS2SClustering",
        "FinanceArxivP2PClustering",
        "WikiCompany2IndustryClustering",
    ],
    "zh": [
        "MInDS14ZhClustering",
        "FinNLClustering",
        "CCKS2022Clustering",
        "CCKS2020Clustering",
        "CCKS2019Clustering",
    ],
}

TASK_LIST_RERANKING = {
    "en": [
        "FinFactReranking",
        "FiQA2018Reranking",
        "HC3Reranking",
    ],
    "zh": [
        "FinEvaReranking",
        "DISCFinLLMReranking",
    ],
}

TASK_LIST_SUM = {
    "en": [
        "Ectsum",
        "FINDsum",
        "FNS2022sum",
    ],
    "zh": [
        "FiNNAsum",
        "FinEvaHeadlinesum",
        "FinEvasum",
    ],
}

TASK_LIST_PAIRCLASSIFICATION = {
    "en": [
        "HeadlineACPairClassification",
        "HeadlinePDDPairClassification",
        "HeadlinePDUPairClassification",
    ],
    "zh": [
        "AFQMCPairClassification",
    ],
}
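task_metadata.py is plain data and is not imported by app.py in this commit. A minimal sketch of aggregating the lists (assumed usage, not part of the upload), for example to count the datasets per task and language:

import task_metadata as tm

ALL_TASKS = {
    "STS": tm.TASK_LIST_STS,
    "Classification": tm.TASK_LIST_CLASSIFICATION,
    "Retrieval": tm.TASK_LIST_RETRIEVAL,
    "Clustering": tm.TASK_LIST_CLUSTERING,
    "Reranking": tm.TASK_LIST_RERANKING,
    "Summarization": tm.TASK_LIST_SUM,
    "PairClassification": tm.TASK_LIST_PAIRCLASSIFICATION,
}

for task, langs in ALL_TASKS.items():
    # e.g. "Retrieval {'en': 10, 'zh': 10}"
    print(task, {lang: len(datasets) for lang, datasets in langs.items()})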