Spaces:
No application file
No application file
File size: 6,887 Bytes
8772b50 7e055ad f646fc1 8772b50 7e055ad 8772b50 7e055ad 8772b50 7e055ad 8772b50 7e055ad f646fc1 8772b50 7e055ad 8772b50 7e055ad 8772b50 7e055ad 8772b50 7e055ad 8772b50 e7d518b 8772b50 f646fc1 8772b50 f646fc1 7e055ad f646fc1 7e055ad f646fc1 7e055ad f646fc1 7e055ad f646fc1 8772b50 f646fc1 8772b50 f646fc1 8772b50 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
"""
Data service provider
"""
import json
from typing import List
import pandas as pd
from app.backend.constant import ModelProvider
from utils.cache_decorator import cache_df_with_custom_key, cache_dict_with_custom_key
from utils.http_utils import get
# Column order of the leaderboard table: model / dataset metadata first, then
# each retrieval metric (ndcg, recall, precision) at every cut-off rank.
COLUMNS = [
    'model_name', 'group_name', 'leaderboard', 'dataset_name',
    'embd_dtype', 'embd_dim', 'num_params', 'max_tokens', 'similarity',
    'query_instruct', 'corpus_instruct',
    *(
        f'{metric}_at_{k}'
        for metric in ('ndcg', 'recall', 'precision')
        for k in (1, 3, 5, 10, 20, 50, 100)
    ),
]

# One type tag per entry of COLUMNS, in the same order (both lists hold 32
# entries).
COLUMNS_TYPES = [
    'markdown',  # model_name
    'str',       # group_name
    'str',       # leaderboard
    'str',       # dataset_name
    'str',       # embd_dtype
    'str',       # embd_dim
    'number',    # num_params
    'number',    # max_tokens
    'str',       # similarity
    'str',       # query_instruct
    'str',       # corpus_instruct
    *(['number'] * 21),  # 3 metrics x 7 cut-off ranks
]

# Base URL of the raw result files on GitHub, and the three JSON documents
# read from it.
GIT_URL = "https://raw.githubusercontent.com/embedding-benchmark/ebr/refs/heads/main/results/"
DATASET_URL = f"{GIT_URL}datasets.json"
MODEL_URL = f"{GIT_URL}models.json"
RESULT_URL = f"{GIT_URL}results.json"
class DataEngine:
def __init__(self):
    """
    Build the table once and keep it on the instance as ``self.df``
    """
    frame = self.init_dataframe()
    self.df = frame
@property
@cache_dict_with_custom_key("models")
def models(self):
    """
    Fetch the models document from MODEL_URL

    Returns the decoded JSON body when the request answers with HTTP 200,
    otherwise an empty dict.
    NOTE(review): presumably cached under the key "models" by
    cache_dict_with_custom_key - confirm in utils.cache_decorator.
    """
    response = get(MODEL_URL)
    if response.status_code != 200:
        return {}
    return response.json()
@property
@cache_dict_with_custom_key("datasets")
def datasets(self):
    """
    Fetch the datasets document from DATASET_URL

    Returns the decoded JSON body when the request answers with HTTP 200,
    otherwise an empty dict.
    NOTE(review): presumably cached under the key "datasets" by
    cache_dict_with_custom_key - confirm in utils.cache_decorator.
    """
    response = get(DATASET_URL)
    succeeded = response.status_code == 200
    return response.json() if succeeded else {}
@property
@cache_dict_with_custom_key("results")
def results(self):
    """
    Fetch the results document from RESULT_URL

    Returns the decoded JSON body when the request answers with HTTP 200,
    otherwise an empty dict.
    NOTE(review): presumably cached under the key "results" by
    cache_dict_with_custom_key - confirm in utils.cache_decorator.
    """
    response = get(RESULT_URL)
    if response.status_code != 200:
        payload = {}
    else:
        payload = response.json()
    return payload
def init_dataframe(self):
    """
    Build the initial DataFrame by delegating to ``jsons_to_df``
    """
    frame = self.jsons_to_df()
    return frame
def get_data(self):
    """
    Return the whole table, best NDCG@10 first

    The grouping columns (group_name, leaderboard, dataset_name) are
    removed from the result; ``self.df`` itself is left untouched.
    """
    hidden_columns = ['group_name', 'leaderboard', 'dataset_name']
    # Work on a copy, strip the grouping columns, then rank by NDCG@10.
    visible = self.df.copy().drop(columns=hidden_columns)
    ranked = visible.sort_values(by='ndcg_at_10', ascending=False)
    return ranked
def get_filtered_data(self, navigation=None, embd_type=None, embd_dims=None, similarity=None):
    """
    Get filtered dataset based on criteria

    Every criterion is optional; a falsy value or the string "all" leaves
    the corresponding column unfiltered.

    Args:
        navigation: value the 'leaderboard' column must equal.
        embd_type: value the 'embd_dtype' column must equal.
        embd_dims: collection (list, tuple, set or frozenset) of accepted
            'embd_dim' values. An empty collection, or anything that is
            not one of those collection types (e.g. a string), disables
            the filter.
        similarity: value the 'similarity' column must equal.

    Returns:
        A new DataFrame without the group_name / leaderboard /
        dataset_name columns, sorted by 'ndcg_at_10' in descending order.
        ``self.df`` is not modified.
    """
    # Boolean indexing, drop() and sort_values() all return new objects,
    # so no defensive copy of self.df is required.
    filtered_df = self.df

    equality_filters = (
        ('leaderboard', navigation),
        ('embd_dtype', embd_type),
        ('similarity', similarity),
    )
    for column, wanted in equality_filters:
        if wanted and wanted != "all":
            filtered_df = filtered_df[filtered_df[column] == wanted]

    # Accept any plain collection, not only list: previously a tuple or a
    # set of dimensions was silently ignored and nothing was filtered.
    if isinstance(embd_dims, (list, tuple, set, frozenset)) and embd_dims:
        filtered_df = filtered_df[filtered_df['embd_dim'].isin(list(embd_dims))]

    # Remove the grouping columns from the result.
    columns_to_remove = ['group_name', 'leaderboard', 'dataset_name']
    filtered_df = filtered_df.drop(columns=columns_to_remove)

    # Sort by NDCG@10, best first.
    return filtered_df.sort_values(by='ndcg_at_10', ascending=False)
def _check_providers(self, organization: str, providers: List):
    """
    Tell whether ``organization`` passes the ``providers`` selection

    Returns True when:
      * ``providers`` is empty / falsy (nothing selected), or
      * "Others" is selected and the organization is none of the
        ModelProvider values OPENAI, COHERE, VOYAGEAI, or
      * the organization is itself listed in ``providers``.
    Returns False otherwise.
    """
    if providers:
        if "Others" in providers and organization not in (
                ModelProvider.OPENAI.value,
                ModelProvider.COHERE.value,
                ModelProvider.VOYAGEAI.value):
            return True
        return organization in providers
    return True
@cache_df_with_custom_key("json_result")
def jsons_to_df(self):
    """
    Join results, datasets and models into one leaderboard DataFrame

    Steps:
      1. Turn every entry of ``self.results`` into a frame of its
         "results" rows, tagged with the entry's "dataset_name".
      2. Expand every entry of ``self.datasets`` into one row per dataset
         carrying the group "name" and "leaderboard".
      3. Inner-join results with ``self.models`` on
         (model_name, embd_dim, embd_dtype), then with the dataset table
         on dataset_name.
      4. Replace model_name by an HTML link pointing at the model's
         "reference" value.

    Returns:
        A DataFrame restricted to COLUMNS. When any of the three inputs
        is empty (for example because a fetch returned the empty
        fallback) or the joins match nothing, an empty DataFrame with the
        COLUMNS header is returned instead of raising.

    NOTE(review): the return value goes through cache_df_with_custom_key;
    confirm that caching an empty frame after a failed fetch is acceptable.
    """
    from html import escape  # local import: only needed for the link markup

    empty = pd.DataFrame(columns=COLUMNS)

    # One frame per result entry, tagged with the dataset it belongs to.
    result_frames = []
    for entry in self.results:
        frame = pd.DataFrame(entry["results"])
        frame["dataset_name"] = entry["dataset_name"]
        result_frames.append(frame)

    # One row per (group, dataset) pair.
    dataset_frames = []
    for group in self.datasets:
        names = group["datasets"]
        dataset_frames.append(pd.DataFrame({
            "group_name": [group["name"]] * len(names),
            "dataset_name": names,
            "leaderboard": [group["leaderboard"]] * len(names),
        }))

    df_model = pd.DataFrame(self.models)

    # pd.concat raises ValueError on an empty list, and merging needs the
    # key columns to exist, so bail out before touching empty inputs.
    if not result_frames or not dataset_frames or df_model.empty:
        return empty

    df_result = pd.concat(result_frames)
    df_dataset = pd.concat(dataset_frames).drop_duplicates()
    if df_result.empty or df_dataset.empty:
        return empty

    df = pd.merge(df_result, df_model, on=["model_name", "embd_dim", "embd_dtype"], how="inner")
    df = pd.merge(df, df_dataset, on="dataset_name", how="inner")

    # Must be checked BEFORE building the link column: a row-wise apply on
    # an empty frame does not yield a column that can be assigned back.
    if df.empty:
        return empty

    # Values come from a remote JSON file; escape them so they cannot break
    # out of the attribute or inject markup.
    df["model_name"] = [
        f'<a target="_blank" style="text-decoration: underline" '
        f'href="{escape(str(reference), quote=True)}">{escape(str(name))}</a>'
        for reference, name in zip(df["reference"], df["model_name"])
    ]
    return df[COLUMNS]
def filter_df(self, df_result: pd.DataFrame, embd_dtype: str, embd_dims: List, similarity: str, max_tokens: int):
    """
    Filter a result table by dtype, similarity, max tokens and size bucket

    Args:
        df_result: table to filter; it is never modified.
        embd_dtype: required 'embd_dtype' value; falsy or "all" disables
            the filter.
        embd_dims: selected embedding-size bucket labels, any of
            '<=1k', '1k-2k', '2k-5k', '>=5k'. When empty, no row can
            match and an empty slice of the input is returned.
        similarity: required 'similarity' value; falsy or "all" disables
            the filter.
        max_tokens: minimum accepted 'max_tokens'; falsy disables the
            filter.

    Returns:
        The matching rows restricted to COLUMNS. In the empty
        ``embd_dims`` case the empty result keeps all input columns.
    """
    # Nothing selected -> nothing matches.
    if not embd_dims:
        return df_result[0:0]

    if embd_dtype and embd_dtype != "all":
        df_result = df_result[df_result['embd_dtype'] == embd_dtype]

    if similarity and similarity != "all":
        df_result = df_result[df_result['similarity'] == similarity]

    if max_tokens:
        df_result = df_result[df_result['max_tokens'] >= max_tokens]

    # Bucket embd_dim into half-open intervals [0, 1000), [1000, 2000),
    # [2000, 5000), [5000, inf) (right=False).
    # NOTE(review): with these edges a dimension of exactly 1000 is
    # labelled '1k-2k', not '<=1k' - confirm this is intended.
    bins = [0, 1000, 2000, 5000, float('inf')]
    labels = ['<=1k', '1k-2k', '2k-5k', '>=5k']
    # Keep the bucket in a local Series instead of writing a helper column
    # into df_result: that would mutate the caller's frame when no filter
    # ran before, or write into a slice (SettingWithCopy) when one did.
    size_bucket = pd.cut(df_result['embd_dim'], bins=bins, labels=labels, right=False)
    df_result = df_result[size_bucket.isin(embd_dims)]

    return df_result[COLUMNS]
def summarize_dataframe(self):
"""
Summarize data statistics
"""
|