""" Data service provider """
import json
from typing import Iterator, List, Tuple

import pandas as pd

from app.backend.constant import ModelProvider

# Metric columns shared by every result table, in display order:
# ndcg_at_1 ... ndcg_at_100, recall_at_1 ... recall_at_100, precision_at_1 ...
_METRIC_COLUMNS = [
    f"{metric}_at_{cutoff}"
    for metric in ("ndcg", "recall", "precision")
    for cutoff in (1, 3, 5, 10, 20, 50, 100)
]

# Column layout of the frame returned by DataEngine.filter_df.
_FILTER_COLUMNS = ["class", "organization", "model_name", *_METRIC_COLUMNS]

# Column layout of the frame returned by DataEngine.jsons_to_df.
# NOTE: "dateset" (sic) is the key callers receive; renaming it would change
# the public column schema, so the spelling is kept.
_RESULT_COLUMNS = [
    "leaderboard", "task", "class", "organization", "model_name", "dateset",
    *_METRIC_COLUMNS,
]


class DataEngine:
    """Serve leaderboard, model, task and result data read from ./mock_data."""

    def __init__(self):
        self.df = self.init_dataframe()

    @staticmethod
    def _load_json(path: str):
        """Parse and return the JSON document stored at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @property
    def leaderboards(self):
        """ Get leaderboard data (re-read from disk on every access) """
        return self._load_json('./mock_data/leaderboard.json')

    @property
    def models(self):
        """ Get models data (re-read from disk on every access) """
        return self._load_json('./mock_data/models.json')

    @property
    def tasks(self):
        """ Get tasks data (re-read from disk on every access) """
        return self._load_json('./mock_data/tasks.json')

    @property
    def results(self):
        """ Get results data (re-read from disk on every access) """
        return self._load_json('./mock_data/results.json')

    def init_dataframe(self):
        """ Initialize DataFrame """
        d = {"hello": [123], "world": [456]}
        return pd.DataFrame(d)

    def _check_providers(self, organization: str, providers: List) -> bool:
        """Return True when ``organization`` passes the ``providers`` filter.

        An empty ``providers`` list accepts every organization. The special
        entry "Others" accepts any organization that is not OpenAI, Cohere or
        VoyageAI; otherwise the organization must be listed explicitly.
        """
        if not providers:
            return True
        named_providers = (
            ModelProvider.OPENAI.value,
            ModelProvider.COHERE.value,
            ModelProvider.VOYAGEAI.value,
        )
        if "Others" in providers and organization not in named_providers:
            return True
        return organization in providers

    def filter_df(self, leaderboard: str, task: str, providers: List, sort_key: str) -> pd.DataFrame:
        """Build the result table for one task of one leaderboard.

        Args:
            leaderboard: Name of the leaderboard (exact match on its "name").
            task: Task to show; matched case-insensitively against the
                leaderboard's task list.
            providers: Organizations to keep (see ``_check_providers``).
            sort_key: Metric to sort by, e.g. "NDCG@10"; "@" is replaced by
                "_at_" and the result lower-cased to obtain the column name.

        Returns:
            One row per (model, dataset) result, sorted ascending by the
            requested metric, or an empty frame with the same columns when
            nothing matches.
        """
        leaderboard_tasks = []
        for lb in self.leaderboards:
            if lb["name"] == leaderboard:
                leaderboard_tasks = lb["tasks"]
                break

        selected_tasks = [t for t in leaderboard_tasks if t.upper() == task.upper()]
        if not selected_tasks:
            return pd.DataFrame(columns=_FILTER_COLUMNS)

        # Each property access re-parses its JSON file, so read every file
        # once per call instead of once per loop iteration.
        all_tasks = self.tasks
        all_models = self.models
        all_results = self.results

        frames = []
        for t in selected_tasks:
            datasets = []
            for task_info in all_tasks:
                if task_info["slug"].upper() == t.upper():
                    datasets = task_info["datasets"]
                    break

            for model in all_models:
                if t not in model["tasks"]:
                    continue
                if not self._check_providers(model["organization"], providers):
                    continue
                for dataset in datasets:
                    for result in all_results[dataset]:
                        if result["model_name"] != model["model_name"]:
                            continue
                        # Fresh dict: never write into the loaded record.
                        row = {
                            **result["results"],
                            "class": result["class"],
                            "organization": result["organization"],
                            "model_name": result["model_name"],
                        }
                        frames.append(pd.DataFrame([row])[_FILTER_COLUMNS])

        if frames:
            sort_column = sort_key.replace("@", '_at_').lower()
            return pd.concat(frames).sort_values(by=sort_column)
        return pd.DataFrame(columns=_FILTER_COLUMNS)

    def get_model_result(self, model: dict, task_datasets_map: dict, results: dict) -> Iterator[dict]:
        """Yield one flat result row per leaderboard/task/dataset of ``model``.

        Args:
            model: Model record with "class", "organization", "model_name",
                "leaderboards" and "tasks" entries.
            task_datasets_map: Maps a task slug to its list of dataset names;
                tasks missing from the map contribute no rows.
            results: Maps a dataset name to a list of result records, each
                with "model_name" and a "results" metrics dict.

        Yields:
            A new dict per match holding the metrics plus "class",
            "organization", "model_name", "leaderboard", "dateset" and "task".
            The input ``results`` structure is left unmodified, so collected
            rows do not alias each other.
        """
        model_class = model["class"]
        model_organization = model["organization"]
        model_model_name = model["model_name"]
        for leaderboard in model["leaderboards"]:
            for task in model["tasks"]:
                for dataset in task_datasets_map.get(task, []):
                    for result in results[dataset]:
                        if result["model_name"] != model_model_name:
                            continue
                        yield {
                            **result["results"],
                            "class": model_class,
                            "organization": model_organization,
                            "model_name": model_model_name,
                            "leaderboard": leaderboard,
                            "dateset": dataset,
                            "task": task,
                        }

    def jsons_to_df(self) -> Tuple[pd.DataFrame, list]:
        """Flatten the JSON files into one results frame.

        Returns:
            A ``(frame, leaderboards)`` tuple: ``frame`` has one row per
            model/leaderboard/task/dataset result (empty, with the same
            columns, when there are none) and ``leaderboards`` is the parsed
            leaderboard list.
        """
        leaderboards = self.leaderboards

        # Map each task slug to the datasets it is evaluated on.
        task_datasets_map = {task["slug"]: task["datasets"] for task in self.tasks}

        results = self.results
        frames = [
            pd.DataFrame([row])
            for model in self.models
            for row in self.get_model_result(model, task_datasets_map, results)
        ]

        if frames:
            return pd.concat(frames)[_RESULT_COLUMNS], leaderboards
        return pd.DataFrame(columns=_RESULT_COLUMNS), leaderboards

    def filter_by_providers(self, df_result: pd.DataFrame, providers: List) -> pd.DataFrame:
        """Keep only the rows whose organization passes the provider filter.

        Unlike ``filter_df``, an empty ``providers`` list selects nothing here:
        the result is an empty frame with the columns of ``df_result``.
        """
        if not providers:
            # providers are empty, return empty
            return df_result[0:0]
        # Cast to bool so the selection is always a row mask: on a frame with
        # no rows, apply() returns an empty object-dtype Series, which is not
        # treated as a boolean mask.
        mask = df_result['organization'].apply(
            lambda x: self._check_providers(x, providers)
        ).astype(bool)
        return df_result[mask]

    def summarize_dataframe(self):
        """ Summarize data statistics """