Spaces:
Running
Running
File size: 4,371 Bytes
19d93fe 4c1e130 19d93fe 4c1e130 0858809 19d93fe 9331159 4c1e130 9331159 19d93fe ebac224 19d93fe ebac224 4c1e130 19d93fe 4c1e130 19d93fe ebac224 19d93fe ebac224 19d93fe 4c1e130 ebac224 4c1e130 19d93fe 4c1e130 ebac224 4c1e130 1045c52 19d93fe 4c1e130 9331159 4c1e130 9331159 4c1e130 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
from git import Repo
import shutil
import os
import json
import pandas as pd
from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname
class ModelHandler:
    """Fetch MTEB ViDoRe benchmark results from the public results repo
    and render them as pandas DataFrames for a leaderboard UI."""

    def __init__(self):
        # model_name -> {"meta": dict, "results": {mteb_dataset_name: raw result json}}
        self.model_infos = {}

    @staticmethod
    def get_folders(dir_path):
        """Return the sorted names of the immediate sub-directories of *dir_path*."""
        return sorted(
            entry
            for entry in os.listdir(dir_path)
            if os.path.isdir(os.path.join(dir_path, entry))
        )

    @staticmethod
    def _load_json(path):
        """Read *path* and return the parsed JSON object."""
        with open(path, "r") as f:
            return json.load(f)

    def get_vidore_data(self, metric="ndcg_at_5"):
        """Clone (or pull) the embeddings-benchmark results repository and
        populate ``self.model_infos`` with each model's metadata and raw
        per-dataset results.

        ``metric`` is accepted for interface compatibility but is not used
        here; metric selection happens in :meth:`render_df`.
        """
        repo_url = "https://github.com/embeddings-benchmark/results.git"
        local_path = "./results"
        folder_of_interest = "results"

        if os.path.exists(local_path):
            # Refresh the existing checkout instead of re-cloning.
            Repo(local_path).remotes.origin.pull()
        else:
            # Shallow clone: only the latest snapshot is needed.
            Repo.clone_from(repo_url, local_path, depth=1)

        results_root = os.path.join(local_path, folder_of_interest)
        for model_name in self.get_folders(results_root):
            revisions = self.get_folders(os.path.join(results_root, model_name))
            if not revisions:
                # No revision folder committed for this model yet — skip it
                # (previously this raised IndexError on revisions[0]).
                continue
            revision_dir = os.path.join(results_root, model_name, revisions[0])
            # Set for O(1) membership tests below.
            result_filenames = set(os.listdir(revision_dir))

            meta = {}
            if "model_meta.json" in result_filenames:
                meta = self._load_json(os.path.join(revision_dir, "model_meta.json"))

            results = {}
            # Only take a benchmark's scores when the run is complete, i.e.
            # every dataset of that benchmark version has a result file.
            for dataset_names in (VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES):
                if all(f"{name}.json" in result_filenames for name in dataset_names):
                    for name in dataset_names:
                        results[name] = self._load_json(
                            os.path.join(revision_dir, f"{name}.json")
                        )

            self.model_infos[model_name] = {"meta": meta, "results": results}

    def filter_models_by_benchmark(self, benchmark_version=1):
        """Return the subset of ``self.model_infos`` whose results contain at
        least one dataset of the requested benchmark version (1 or 2)."""
        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        return {
            model: info
            for model, info in self.model_infos.items()
            if any(
                keyword in dataset
                for dataset in info["results"]
                for keyword in keywords
            )
        }

    def render_df(self, metric="ndcg_at_5", benchmark_version=1):
        """Build a DataFrame of *metric* scores, one row per model and one
        column per dataset (plus a model-size column).

        Returns an empty DataFrame when no model matches the benchmark.
        """
        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
        if not filtered_model_infos:
            return pd.DataFrame()

        # Loop-invariant: the dataset-name filter does not depend on the model.
        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES

        model_res = {}
        for model, info in filtered_model_infos.items():
            dataset_res = {}
            # Guard against a present-but-null "n_parameters" entry, which
            # would previously raise TypeError on floor division.
            n_parameters = info["meta"].get("n_parameters")
            dataset_res["Model Size (Million Parameters)"] = (
                n_parameters // 1_000_000 if n_parameters is not None else None
            )
            for dataset, payload in info["results"].items():
                if not any(keyword in dataset for keyword in keywords):
                    continue
                dataset_nickname = get_datasets_nickname(dataset)
                # Scores are stored as a list with a single "test" split entry.
                dataset_res[dataset_nickname] = payload["scores"]["test"][0][metric]
            model_res[model] = dataset_res
        return pd.DataFrame(model_res).T
|