File size: 6,306 Bytes
4c1e130
 
9331159
 
4c1e130
9331159
 
ebac224
4c1e130
0858809
 
9331159
4c1e130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9331159
 
 
ebac224
 
 
 
 
 
 
 
 
 
 
 
 
4c1e130
 
 
 
ebac224
 
 
4c1e130
9331159
0858809
 
9331159
 
4c1e130
 
 
 
9331159
ebac224
1045c52
9331159
ebac224
4c1e130
ebac224
4c1e130
ebac224
 
 
 
4c1e130
ebac224
 
4c1e130
ebac224
 
 
4c1e130
ebac224
 
 
 
4c1e130
ebac224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1e130
ebac224
 
 
 
4c1e130
ebac224
4c1e130
ebac224
4c1e130
1045c52
 
 
4c1e130
9331159
4c1e130
9331159
4c1e130
 
 
 
ebac224
60415a5
4c1e130
 
 
 
 
 
 
 
 
 
 
 
ebac224
4c1e130
 
 
 
 
 
9331159
1045c52
 
 
4c1e130
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
import os
from typing import Any, Dict

import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, metadata_load

from .dataset_handler import VIDORE_2_DATASETS_KEYWORDS, VIDORE_DATASETS_KEYWORDS, get_datasets_nickname

BLOCKLIST = ["impactframes"]


class ModelHandler:
    """Collects, caches, and aggregates ViDoRe benchmark results from the Hugging Face Hub.

    Results are stored in ``self.model_infos`` as
    ``{sanitized_model_name: {"meta": <README metadata>, "results": {dataset: metrics}}}``.
    """

    def __init__(self, model_infos_path="model_infos.json"):
        self.api = HfApi()
        self.model_infos_path = model_infos_path
        self.model_infos = self._load_model_infos()

    def _load_model_infos(self) -> Dict:
        """Load the cached model infos from disk; return an empty dict if no cache exists."""
        if os.path.exists(self.model_infos_path):
            with open(self.model_infos_path) as f:
                return json.load(f)
        return {}

    def _save_model_infos(self):
        """Persist the in-memory model infos cache to ``self.model_infos_path``."""
        with open(self.model_infos_path, "w") as f:
            json.dump(self.model_infos, f)

    def _are_results_in_new_vidore_format(self, results: Dict[str, Any]) -> bool:
        """Return True if `results` uses the new schema with top-level 'metadata'/'metrics' keys."""
        return "metadata" in results and "metrics" in results

    def _is_baseline_repo(self, repo_id: str) -> bool:
        """Return True for the official baseline-results repository."""
        return repo_id == "vidore/baseline-results"

    def sanitize_model_name(self, model_name):
        """Escape '/' and '.' so the model name is safe to use as a flat key."""
        return model_name.replace("/", "_").replace(".", "-thisisapoint-")

    def fuze_model_infos(self, model_name, results):
        """Merge `results` into the existing entry for `model_name`.

        Only datasets not already present are added, so previously stored
        (non-baseline) results keep priority over the incoming ones.
        """
        existing = self.model_infos[model_name]["results"]
        for dataset, metrics in results.items():
            existing.setdefault(dataset, metrics)

    def get_vidore_data(self, metric="ndcg_at_5"):
        """Crawl all 'vidore'-tagged Hub models and load their results into ``self.model_infos``.

        `metric` is kept for interface compatibility; it is not used here.
        Repositories whose organization is in ``BLOCKLIST`` are skipped; any
        per-file download/parse error is logged and skipped.
        """
        models = self.api.list_models(filter="vidore")
        repositories = [model.modelId for model in models]  # type: ignore

        # Process non-baseline repos first so their results are already stored
        # when the baseline repo is reached (non-baseline results take priority).
        repositories.sort(key=lambda x: self._is_baseline_repo(x))

        for repo_id in repositories:
            org_name = repo_id.split("/")[0]
            if org_name in BLOCKLIST:
                continue
            files = [f for f in self.api.list_repo_files(repo_id) if f.endswith("_metrics.json") or f == "results.json"]
            if not files:
                continue

            for file in files:
                # An aggregated "results.json" is keyed by repo; per-model
                # "<name>_metrics.json" files are keyed by the file stem.
                if file.endswith("results.json"):
                    model_name = self.sanitize_model_name(repo_id)
                else:
                    model_name = self.sanitize_model_name(file.split("_metrics.json")[0])

                try:
                    readme_path = hf_hub_download(repo_id, filename="README.md")
                    meta = metadata_load(readme_path)

                    result_path = hf_hub_download(repo_id, filename=file)
                    with open(result_path) as f:
                        results = json.load(f)

                    if self._are_results_in_new_vidore_format(results):
                        results = results["metrics"]

                    # Handles the case where the model is both in baseline and
                    # outside of it: merge the baseline results into the existing
                    # entry instead of overwriting it, so the non-baseline
                    # results are prioritized.
                    if self._is_baseline_repo(repo_id) and model_name in self.model_infos:
                        self.fuze_model_infos(model_name, results)
                    else:
                        self.model_infos[model_name] = {"meta": meta, "results": results}
                except Exception as e:
                    print(f"Error loading {model_name} - {e}")
                    continue

    def filter_models_by_benchmark(self, benchmark_version=1):
        """Return only the models whose results mention a dataset of the given benchmark."""
        keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS

        filtered_model_infos = {}
        for model, info in self.model_infos.items():
            results = info["results"]
            if any(any(keyword in dataset for keyword in keywords) for dataset in results.keys()):
                filtered_model_infos[model] = info
        return filtered_model_infos

    def compute_averages(self, metric="ndcg_at_5", benchmark_version=1):
        """Build a DataFrame of `metric` values (rows: models, columns: dataset nicknames).

        Returns an empty DataFrame when no model matches the benchmark.
        """
        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
        if not filtered_model_infos:
            return pd.DataFrame()

        # Hoisted out of the per-model loop: the keyword list is invariant.
        keywords = VIDORE_DATASETS_KEYWORDS if benchmark_version == 1 else VIDORE_2_DATASETS_KEYWORDS

        model_res = {}
        for model, info in filtered_model_infos.items():
            res = info["results"]
            model_res[model] = {
                get_datasets_nickname(dataset): metrics[metric]
                for dataset, metrics in res.items()
                if any(keyword in dataset for keyword in keywords)
            }
        return pd.DataFrame(model_res).T

    @staticmethod
    def add_rank(df, benchmark_version=1):
        """Add 'Rank' (and, for multi-dataset frames, 'Average') columns and scale scores.

        Mutates `df` in place (fillna/sort) and also returns it. Float columns
        are converted to percentages rounded to one decimal place.
        """
        df.fillna(0.0, inplace=True)
        # Metadata columns that must not participate in the average/ranking.
        cols_to_rank = [
            col
            for col in df.columns
            if col
            not in [
                "Model",
                "Model Size (Million Parameters)",
                "Memory Usage (GB, fp32)",
                "Embedding Dimensions",
                "Max Tokens",
            ]
        ]

        if len(cols_to_rank) == 1:
            df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
        else:
            # skipna=False: a model missing any dataset gets NaN rather than a
            # flattering partial average (NaNs were already filled with 0.0 above).
            df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
            df.sort_values("Average", ascending=False, inplace=True)
        df.insert(0, "Rank", list(range(1, len(df) + 1)))
        # Multiply values by 100 if they are floats and round to 1 decimal place.
        for col in df.columns:
            if df[col].dtype == "float64":
                df[col] = df[col].apply(lambda x: round(x * 100, 1))
        return df