"""
Data service provider
"""
import json
from typing import List

import pandas as pd

from app.backend.constant import ModelProvider
from utils.cache_decorator import cache_df_with_custom_key, cache_dict_with_custom_key
from utils.http_utils import get
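
# Note: utils.http_utils.get is used below as a requests-style client; it is
# assumed to return a response object exposing .status_code and .json().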

COLUMNS = ['model_name', 'group_name', 'leaderboard', 'dataset_name',
           'embd_dtype', 'embd_dim', 'num_params', 'max_tokens', 'similarity',
           'query_instruct', 'corpus_instruct', 'ndcg_at_1', 'ndcg_at_3', 'ndcg_at_5',
           'ndcg_at_10', 'ndcg_at_20',
           'ndcg_at_50', 'ndcg_at_100', 'recall_at_1', 'recall_at_3',
           'recall_at_5', 'recall_at_10', 'recall_at_20', 'recall_at_50',
           'recall_at_100', 'precision_at_1', 'precision_at_3', 'precision_at_5',
           'precision_at_10', 'precision_at_20', 'precision_at_50',
           'precision_at_100']

COLUMNS_TYPES = ['markdown', 'str', 'str', 'str',
                 'str', 'str', 'number', 'number', 'str',
                 'str', 'str', 'number', 'number', 'number',
                 'number', 'number',
                 'number', 'number', 'number', 'number',
                 'number', 'number', 'number', 'number',
                 'number', 'number', 'number', 'number',
                 'number', 'number', 'number',
                 'number']
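
# COLUMNS_TYPES is positionally aligned with COLUMNS: the first entry renders
# model_name as markdown (it carries the HTML link built in jsons_to_df); the
# rest are plain strings or numbers.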

GIT_URL = "https://raw.githubusercontent.com/embedding-benchmark/ebr/refs/heads/main/results/"
DATASET_URL = f"{GIT_URL}datasets.json"
MODEL_URL = f"{GIT_URL}models.json"
RESULT_URL = f"{GIT_URL}results.json"
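
# Expected payload shapes, inferred from how jsons_to_df consumes them (a
# sketch, not a schema guarantee):
#   results.json:  [{"dataset_name": str, "results": [{"model_name": ..., "ndcg_at_10": ...}, ...]}, ...]
#   datasets.json: [{"name": str, "datasets": [str, ...], "leaderboard": str}, ...]
#   models.json:   [{"model_name": str, "embd_dim": int, "embd_dtype": str, "reference": str, ...}, ...]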


class DataEngine:

    def __init__(self):
        self.df = self.init_dataframe()

    @property
    @cache_dict_with_custom_key("models")
    def models(self):
        """
        Get models data
        """
        res = get(MODEL_URL)
        if res.status_code == 200:
            return res.json()
        return {}

    @property
    @cache_dict_with_custom_key("datasets")
    def datasets(self):
        """
        Get datasets data
        """
        res = get(DATASET_URL)
        if res.status_code == 200:
            return res.json()
        return {}

    @property
    @cache_dict_with_custom_key("results")
    def results(self):
        """
        Get results data
        """
        res = get(RESULT_URL)
        if res.status_code == 200:
            return res.json()
        return {}

    def init_dataframe(self):
        """
        Initialize DataFrame
        """
        return self.jsons_to_df()

    def get_data(self):
        """
        Get the full dataset
        """
        df = self.df.copy()
        # Drop grouping columns that are not displayed
        columns_to_remove = ['group_name', 'leaderboard', 'dataset_name']
        df = df.drop(columns=columns_to_remove)
        # Sort by NDCG@10 in descending order
        return df.sort_values(by='ndcg_at_10', ascending=False)

    def get_filtered_data(self, navigation=None, embd_type=None, embd_dims=None, similarity=None):
        """
        Get filtered dataset based on criteria
        """
        filtered_df = self.df.copy()

        if navigation and navigation != "all":
            filtered_df = filtered_df[filtered_df['leaderboard'] == navigation]

        if embd_type and embd_type != "all":
            filtered_df = filtered_df[filtered_df['embd_dtype'] == embd_type]

        if similarity and similarity != "all":
            filtered_df = filtered_df[filtered_df['similarity'] == similarity]

        if embd_dims and isinstance(embd_dims, list) and len(embd_dims) > 0:
            filtered_df = filtered_df[filtered_df['embd_dim'].isin(embd_dims)]

        # Drop grouping columns that are not displayed
        columns_to_remove = ['group_name', 'leaderboard', 'dataset_name']
        filtered_df = filtered_df.drop(columns=columns_to_remove)
        # Sort by NDCG@10 in descending order
        return filtered_df.sort_values(by='ndcg_at_10', ascending=False)

    def _check_providers(self, organization: str, providers: List):
        """
        Check whether a model's organization matches the selected providers.
        "Others" matches any organization that is not OpenAI, Cohere, or Voyage AI.
        """
        if not providers:
            return True
        if "Others" in providers:
            if organization not in (
                    ModelProvider.OPENAI.value, ModelProvider.COHERE.value, ModelProvider.VOYAGEAI.value):
                return True
        return organization in providers
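
    # _check_providers example (hypothetical values): with providers=["Others"],
    # an organization such as "JinaAI" passes because it is not one of
    # OpenAI/Cohere/Voyage AI; "OpenAI" passes only when "OpenAI" itself is in
    # the providers list.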

    @cache_df_with_custom_key("json_result")
    def jsons_to_df(self):
        """
        Merge the results, datasets, and models JSON payloads into a single DataFrame.
        """
        # Flatten per-dataset result records into one DataFrame
        results_list = self.results
        df_results_list = []
        for result_dict in results_list:
            dataset_name = result_dict["dataset_name"]
            df_result_row = pd.DataFrame(result_dict["results"])
            df_result_row["dataset_name"] = dataset_name
            df_results_list.append(df_result_row)
        df_result = pd.concat(df_results_list)

        # Expand each dataset group into one row per dataset; scalar values
        # broadcast across the list of dataset names
        df_datasets_list = []
        for item in self.datasets:
            dataset_names = item["datasets"]
            df_dataset_row = pd.DataFrame(
                {
                    "group_name": item["name"],
                    "dataset_name": dataset_names,
                    "leaderboard": item["leaderboard"]
                }
            )
            df_datasets_list.append(df_dataset_row)
        df_dataset = pd.concat(df_datasets_list).drop_duplicates()

        models_list = self.models
        df_model = pd.DataFrame(models_list)

        # Join results with model metadata, then attach dataset grouping info
        df = pd.merge(df_result, df_model, on=["model_name", "embd_dim", "embd_dtype"], how="inner")
        df = pd.merge(df, df_dataset, on="dataset_name", how="inner")

        if df.empty:
            return pd.DataFrame(columns=COLUMNS)

        # Render model_name as a link to the model's reference page
        df["model_name"] = df.apply(
            lambda x: f'<a target="_blank" style="text-decoration: underline" href="{x["reference"]}">{x["model_name"]}</a>',
            axis=1)
        return df[COLUMNS]

    def filter_df(self, df_result: pd.DataFrame, embd_dtype: str, embd_dims: List, similarity: str, max_tokens: int):
        """
        Filter results by embedding dtype, dimension bucket, similarity, and max tokens.
        """
        if not embd_dims:
            return df_result[0:0]

        if embd_dtype and embd_dtype != "all":
            df_result = df_result[df_result['embd_dtype'] == embd_dtype]

        if similarity and similarity != "all":
            df_result = df_result[df_result['similarity'] == similarity]

        if max_tokens:
            df_result = df_result[df_result['max_tokens'] >= max_tokens]

        # Bucket embedding dimensions and keep only the selected buckets;
        # copy first to avoid mutating a view of the caller's frame
        bins = [0, 1000, 2000, 5000, float('inf')]
        labels = ['<=1k', '1k-2k', '2k-5k', '>=5k']
        df_result = df_result.copy()
        df_result['value_group'] = pd.cut(df_result['embd_dim'], bins=bins, labels=labels, right=False)
        df_result = df_result[df_result['value_group'].isin(embd_dims)]
        df_result = df_result[COLUMNS]

        return df_result
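
    # filter_df bucket semantics: bins are left-closed (right=False), so an
    # embd_dim of exactly 1000 lands in '1k-2k', not '<=1k'. Hypothetical call:
    # engine.filter_df(df, "float32", ['1k-2k'], "cosine", 512) keeps
    # 1024-dim float32 models scored with cosine and max_tokens >= 512.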

    def summarize_dataframe(self):
        """
        Summarize data statistics
        """