q275343119's picture
mod - Model Provides 全不选的时候无返回
9e296e4
raw
history blame
7.67 kB
"""
Data service provider
"""
import json
from typing import List
import pandas as pd
from app.backend.constant import ModelProvider
class DataEngine:
def __init__(self):
self.df = self.init_dataframe()
@property
def leaderboards(self):
"""
Get leaderboard data
"""
with open('./mock_data/leaderboard.json', 'r') as f:
return json.load(f)
@property
def models(self):
"""
Get models data
"""
with open('./mock_data/models.json', 'r') as f:
return json.load(f)
@property
def tasks(self):
"""
Get tasks data
"""
with open('./mock_data/tasks.json', 'r') as f:
return json.load(f)
@property
def results(self):
"""
Get results data
"""
with open('./mock_data/results.json', 'r') as f:
return json.load(f)
def init_dataframe(self):
"""
Initialize DataFrame
"""
d = {"hello": [123], "world": [456]}
return pd.DataFrame(d)
def _check_providers(self, organization: str, providers: List):
if not providers:
return True
if "Others" in providers:
if organization not in (
ModelProvider.OPENAI.value, ModelProvider.COHERE.value, ModelProvider.VOYAGEAI.value):
return True
return organization in providers
def filter_df(self, leaderboard: str, task: str, providers: List, sort_key: str):
tasks = []
for lb in self.leaderboards:
if lb["name"] == leaderboard:
tasks = lb["tasks"]
break
df_list = []
for t in (filter(lambda x: x.upper() == task.upper(), tasks)):
datasets = []
for ta in self.tasks:
if ta["slug"].upper() == t.upper():
datasets = ta["datasets"]
break
for model in self.models:
if t in model["tasks"] and self._check_providers(model["organization"], providers):
for dataset in datasets:
results = self.results[dataset]
for result in results:
if result['model_name'] == model["model_name"]:
d = result["results"]
d["class"] = result["class"]
d["organization"] = result["organization"]
d["model_name"] = result["model_name"]
df = pd.DataFrame([d])
df = df[["class", "organization", "model_name", "ndcg_at_1", "ndcg_at_3", "ndcg_at_5",
"ndcg_at_10",
"ndcg_at_20", "ndcg_at_50", "ndcg_at_100", "recall_at_1", "recall_at_3",
"recall_at_5", "recall_at_10",
"recall_at_20", "recall_at_50", "recall_at_100", "precision_at_1",
"precision_at_3", "precision_at_5",
"precision_at_10", "precision_at_20", "precision_at_50", "precision_at_100"]]
df_list.append(df)
if df_list:
return pd.concat(df_list).sort_values(by=sort_key.replace("@", '_at_').lower())
return pd.DataFrame(columns=["class", "organization", "model_name", "ndcg_at_1", "ndcg_at_3", "ndcg_at_5",
"ndcg_at_10",
"ndcg_at_20", "ndcg_at_50", "ndcg_at_100", "recall_at_1", "recall_at_3",
"recall_at_5", "recall_at_10",
"recall_at_20", "recall_at_50", "recall_at_100", "precision_at_1",
"precision_at_3", "precision_at_5",
"precision_at_10", "precision_at_20", "precision_at_50", "precision_at_100"])
def get_model_result(self, model: dict, task_datasets_map: dict, results: dict):
"""
get_model_result
"""
model_class = model["class"]
model_organization = model["organization"]
model_model_name = model["model_name"]
for leaderboard in model["leaderboards"]:
for task in model["tasks"]:
for dateset in task_datasets_map.get(task, []):
for result in results[dateset]:
if result["model_name"] == model_model_name:
d_result = result["results"]
d_result["class"] = model_class
d_result["organization"] = model_organization
d_result["model_name"] = model_model_name
d_result["leaderboard"] = leaderboard
d_result["dateset"] = dateset
d_result["task"] = task
yield d_result
def jsons_to_df(self):
# change leaderboards to task_leaderboard_map
task_leaderboard_map = {}
leaderboards = self.leaderboards
for leaderboard in leaderboards:
for task in leaderboard["tasks"]:
task_leaderboard_map[task] = leaderboard["name"]
# change tasks to task_datasets_map
task_datasets_map = {}
for task in self.tasks:
task_datasets_map[task["slug"]] = task["datasets"]
df_results_list = []
results = self.results
for model in self.models:
for d_result in self.get_model_result(model, task_datasets_map, results):
if d_result:
df_results_list.append(pd.DataFrame([d_result]))
if df_results_list:
df_result = pd.concat(df_results_list)
return df_result[
["leaderboard", "task", "class", "organization", "model_name", "dateset", "ndcg_at_1", "ndcg_at_3",
"ndcg_at_5",
"ndcg_at_10",
"ndcg_at_20", "ndcg_at_50", "ndcg_at_100", "recall_at_1", "recall_at_3",
"recall_at_5", "recall_at_10",
"recall_at_20", "recall_at_50", "recall_at_100", "precision_at_1",
"precision_at_3", "precision_at_5",
"precision_at_10", "precision_at_20", "precision_at_50", "precision_at_100"]], leaderboards
return pd.DataFrame(
columns=["leaderboard", "task", "class", "organization", "model_name", "dateset", "ndcg_at_1", "ndcg_at_3",
"ndcg_at_5",
"ndcg_at_10",
"ndcg_at_20", "ndcg_at_50", "ndcg_at_100", "recall_at_1", "recall_at_3",
"recall_at_5", "recall_at_10",
"recall_at_20", "recall_at_50", "recall_at_100", "precision_at_1",
"precision_at_3", "precision_at_5",
"precision_at_10", "precision_at_20", "precision_at_50", "precision_at_100"]), leaderboards
def filter_by_providers(self, df_result: pd.DataFrame, providers: List):
"""
filter_by_providers
"""
if not providers:
# providers are empty, return empty
return df_result[0:0]
return df_result[df_result['organization'].apply(lambda x: self._check_providers(x, providers))]
def summarize_dataframe(self):
"""
Summarize data statistics
"""