Spaces:
Running
Running
""" | |
Data service provider | |
""" | |
import json | |
from typing import List | |
import pandas as pd | |
from app.backend.constant import ModelProvider | |
class DataEngine: | |
def __init__(self):
    """
    Create the engine and store the frame returned by ``init_dataframe``
    on ``self.df``.
    """
    initial_frame = self.init_dataframe()
    self.df = initial_frame
def leaderboards(self):
    """
    Load and return the leaderboard data.

    Reads ``./mock_data/leaderboard.json`` (relative to the current
    working directory) on every call and returns the parsed JSON.
    The file is decoded as UTF-8 explicitly so that non-ASCII text does
    not depend on the platform's default encoding.
    """
    with open('./mock_data/leaderboard.json', 'r', encoding='utf-8') as f:
        return json.load(f)
def models(self):
    """
    Load and return the models data.

    Reads ``./mock_data/models.json`` (relative to the current working
    directory) on every call and returns the parsed JSON. The file is
    decoded as UTF-8 explicitly so that non-ASCII text does not depend
    on the platform's default encoding.
    """
    with open('./mock_data/models.json', 'r', encoding='utf-8') as f:
        return json.load(f)
def tasks(self):
    """
    Load and return the tasks data.

    Reads ``./mock_data/tasks.json`` (relative to the current working
    directory) on every call and returns the parsed JSON. The file is
    decoded as UTF-8 explicitly so that non-ASCII text does not depend
    on the platform's default encoding.
    """
    with open('./mock_data/tasks.json', 'r', encoding='utf-8') as f:
        return json.load(f)
def results(self):
    """
    Load and return the results data.

    Reads ``./mock_data/results.json`` (relative to the current working
    directory) on every call and returns the parsed JSON. The file is
    decoded as UTF-8 explicitly so that non-ASCII text does not depend
    on the platform's default encoding.
    """
    with open('./mock_data/results.json', 'r', encoding='utf-8') as f:
        return json.load(f)
def init_dataframe(self):
    """
    Build the initial DataFrame.

    Returns a one-row frame with the columns ``hello`` (123) and
    ``world`` (456).
    """
    return pd.DataFrame(data={"hello": [123], "world": [456]})
def _check_providers(self, organization: str, providers: List):
    """
    Decide whether ``organization`` passes the ``providers`` filter.

    Returns True when ``providers`` is empty, when ``"Others"`` is
    selected and the organization is none of the OPENAI, COHERE or
    VOYAGEAI ``ModelProvider`` values, or when the organization itself
    is listed in ``providers``; otherwise False.
    """
    if not providers:
        # An empty filter accepts every organization.
        return True
    others_selected = "Others" in providers
    # The named providers are only consulted when "Others" is selected.
    if others_selected and organization not in (
            ModelProvider.OPENAI.value, ModelProvider.COHERE.value, ModelProvider.VOYAGEAI.value):
        return True
    return organization in providers
def filter_df(self, leaderboard: str, task: str, providers: List, sort_key: str):
    """
    Build the result table for one task of one leaderboard.

    Args:
        leaderboard: Leaderboard name, compared exactly with the ``name``
            field of each leaderboard entry.
        task: Task to show, compared case-insensitively with the tasks
            listed by the selected leaderboard and with each task's
            ``slug``.
        providers: Provider filter, forwarded to ``_check_providers``
            together with each model's ``organization``.
        sort_key: Metric to sort by, e.g. ``"NDCG@10"``. ``@`` is
            replaced by ``_at_`` and the result is lower-cased to obtain
            the column name.

    Returns:
        A DataFrame with ``class``, ``organization`` and ``model_name``
        followed by the ndcg/recall/precision columns, sorted ascending
        by the requested column. When nothing matches, an empty frame
        with the same columns is returned.

    Raises:
        KeyError: If a dataset is absent from the results data, a
            required field is missing, or ``sort_key`` does not map to
            one of the columns.
    """
    columns = ["class", "organization", "model_name"] + [
        "{}_at_{}".format(metric, cutoff)
        for metric in ("ndcg", "recall", "precision")
        for cutoff in (1, 3, 5, 10, 20, 50, 100)
    ]

    # leaderboards/tasks/models/results are plain methods, so they have to
    # be called. Load each document once instead of inside the loops.
    leaderboard_defs = self.leaderboards()
    task_defs = self.tasks()
    model_defs = self.models()
    results = self.results()

    leaderboard_tasks = []
    for lb in leaderboard_defs:
        if lb["name"] == leaderboard:
            leaderboard_tasks = lb["tasks"]
            break

    wanted = task.upper()
    rows = []
    for t in leaderboard_tasks:
        if t.upper() != wanted:
            continue
        # Datasets of the first task definition whose slug matches.
        datasets = next(
            (ta["datasets"] for ta in task_defs if ta["slug"].upper() == wanted),
            [],
        )
        for model in model_defs:
            if t not in model["tasks"]:
                continue
            if not self._check_providers(model["organization"], providers):
                continue
            for dataset in datasets:
                for result in results[dataset]:
                    if result["model_name"] != model["model_name"]:
                        continue
                    # Copy the metrics so the loaded data is not mutated.
                    row = dict(result["results"])
                    row["class"] = result["class"]
                    row["organization"] = result["organization"]
                    row["model_name"] = result["model_name"]
                    rows.append(row)

    if not rows:
        return pd.DataFrame(columns=columns)
    frame = pd.DataFrame(rows)[columns]
    return frame.sort_values(by=sort_key.replace("@", "_at_").lower())
def get_model_result(self, model: dict, task_datasets_map: dict, results: dict):
    """
    Yield one flat result record per matching result of ``model``.

    For every combination of the model's ``leaderboards`` and ``tasks``,
    the datasets of the task are looked up in ``task_datasets_map``
    (tasks without an entry contribute nothing) and every entry of
    ``results[dataset]`` whose ``model_name`` equals the model's is
    emitted.

    Args:
        model: Dict with ``class``, ``organization``, ``model_name``,
            ``leaderboards`` and ``tasks``.
        task_datasets_map: Maps a task to its list of dataset names.
        results: Maps a dataset name to a list of result entries, each
            with ``model_name`` and a ``results`` dict of metrics.

    Yields:
        A new dict per match holding the metrics plus ``class``,
        ``organization``, ``model_name``, ``leaderboard``, ``dateset``
        and ``task``.

    Raises:
        KeyError: If a dataset from ``task_datasets_map`` is missing in
            ``results`` or a required field is absent.
    """
    model_class = model["class"]
    model_organization = model["organization"]
    model_model_name = model["model_name"]
    for leaderboard in model["leaderboards"]:
        for task in model["tasks"]:
            for dateset in task_datasets_map.get(task, []):
                for result in results[dateset]:
                    if result["model_name"] != model_model_name:
                        continue
                    # Work on a copy: the same ``results`` entry is visited
                    # once per leaderboard, so mutating it in place would
                    # make all yielded records share one dict and would
                    # also modify the caller's ``results``.
                    d_result = dict(result["results"])
                    d_result["class"] = model_class
                    d_result["organization"] = model_organization
                    d_result["model_name"] = model_model_name
                    d_result["leaderboard"] = leaderboard
                    d_result["dateset"] = dateset
                    d_result["task"] = task
                    yield d_result
def jsons_to_df(self):
    """
    Flatten the JSON documents into a single results table.

    Loads leaderboards, tasks, models and results, builds a mapping from
    each task ``slug`` to its ``datasets`` and collects the records
    produced by ``get_model_result`` for every model.

    Returns:
        A tuple ``(frame, leaderboards)``. ``frame`` has the columns
        ``leaderboard``, ``task``, ``class``, ``organization``,
        ``model_name`` and ``dateset`` followed by the
        ndcg/recall/precision columns, and is empty (columns only) when
        no record was produced. ``leaderboards`` is the loaded
        leaderboard data.
    """
    columns = ["leaderboard", "task", "class", "organization", "model_name", "dateset"] + [
        "{}_at_{}".format(metric, cutoff)
        for metric in ("ndcg", "recall", "precision")
        for cutoff in (1, 3, 5, 10, 20, 50, 100)
    ]

    # The loaders are plain methods, so they have to be called.
    leaderboards = self.leaderboards()
    task_datasets_map = {task["slug"]: task["datasets"] for task in self.tasks()}
    results = self.results()

    rows = []
    for model in self.models():
        for d_result in self.get_model_result(model, task_datasets_map, results):
            if d_result:
                # Snapshot each record immediately in case the generator
                # reuses one dict object across yields.
                rows.append(dict(d_result))

    if rows:
        return pd.DataFrame(rows)[columns], leaderboards
    return pd.DataFrame(columns=columns), leaderboards
def filter_by_providers(self, df_result: pd.DataFrame, providers: List):
    """
    Keep only the rows whose ``organization`` passes the provider filter.

    Args:
        df_result: Frame with an ``organization`` column.
        providers: Selected providers. When empty, no row is kept.

    Returns:
        The rows of ``df_result`` for which ``_check_providers`` returns
        a truthy value, or an empty slice of ``df_result`` when
        ``providers`` is empty. The columns are preserved in both cases.
    """
    if not providers:
        # providers are empty, return empty
        return df_result[0:0]
    keep = df_result['organization'].apply(lambda x: self._check_providers(x, providers))
    # For an empty frame ``apply`` can hand back an empty Series that is not
    # of boolean dtype; pandas would then read it as a list of column labels
    # rather than a row mask and drop every column. Force a boolean mask.
    return df_result[keep.astype(bool)]
def summarize_dataframe(self): | |
""" | |
Summarize data statistics | |
""" | |