Hugues Sibille committed
Commit: 228207a
Parent(s): fbaa735

feat: update leaderboard with .json from HF

Files changed (1): app.py (+107, -20)
app.py CHANGED
@@ -3,13 +3,87 @@ import os
 
 import gradio as gr
 import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub import HfApi, hf_hub_download, get_collection
 from huggingface_hub.repocard import metadata_load
+from typing import Dict
+
+
+def get_datasets_nickname() -> Dict:
+
+    datasets_nickname = {}
+
+    collection = get_collection("vidore/vidore-benchmark-667173f98e70a1c0fa4db00d")
+
+    collection_items = collection.items
+
+    for item in collection_items:
+        dataset_name = item.item_id
+
+        if 'arxivqa' in dataset_name:
+            datasets_nickname[dataset_name] = 'ArxivQA'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'ArxivQA'
+            datasets_nickname[dataset_name + '_captioning'] = 'ArxivQA'
+
+        elif 'docvqa' in dataset_name:
+            datasets_nickname[dataset_name] = 'DocVQA'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'DocVQA'
+            datasets_nickname[dataset_name + '_captioning'] = 'DocVQA'
+
+        elif 'infovqa' in dataset_name:
+            datasets_nickname[dataset_name] = 'InfoVQA'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'InfoVQA'
+            datasets_nickname[dataset_name + '_captioning'] = 'InfoVQA'
+
+        elif 'tabfquad' in dataset_name:
+            datasets_nickname[dataset_name] = 'TabFQuad'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'TabFQuad'
+            datasets_nickname[dataset_name + '_captioning'] = 'TabFQuad'
+
+        elif 'tatdqa' in dataset_name:
+            datasets_nickname[dataset_name] = 'TATDQA'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'TATDQA'
+            datasets_nickname[dataset_name + '_captioning'] = 'TATDQA'
+
+        elif 'shiftproject' in dataset_name:
+            datasets_nickname[dataset_name] = 'ShiftProject'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'ShiftProject'
+            datasets_nickname[dataset_name + '_captioning'] = 'ShiftProject'
+
+        elif 'artificial_intelligence' in dataset_name:
+            datasets_nickname[dataset_name] = 'Artificial Intelligence'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'Artificial Intelligence'
+            datasets_nickname[dataset_name + '_captioning'] = 'Artificial Intelligence'
+
+        elif 'energy' in dataset_name:
+            datasets_nickname[dataset_name] = 'Energy'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'Energy'
+            datasets_nickname[dataset_name + '_captioning'] = 'Energy'
+
+        elif 'government_reports' in dataset_name:
+            datasets_nickname[dataset_name] = 'Government Reports'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'Government Reports'
+            datasets_nickname[dataset_name + '_captioning'] = 'Government Reports'
+
+        elif 'healthcare' in dataset_name:
+            datasets_nickname[dataset_name] = 'Healthcare'
+            datasets_nickname[dataset_name + '_ocr_chunk'] = 'Healthcare'
+            datasets_nickname[dataset_name + '_captioning'] = 'Healthcare'
+
+    return datasets_nickname
 
 
 def make_clickable_model(model_name, link=None):
+
     if link is None:
-        link = "https://huggingface.co/" + model_name
+        desanitized_model_name = model_name.replace("_", "/")
+
+        if '/captioning' in desanitized_model_name:
+            desanitized_model_name = desanitized_model_name.replace('/captioning', '')
+        if '/ocr' in desanitized_model_name:
+            desanitized_model_name = desanitized_model_name.replace('/ocr', '')
+
+        link = "https://huggingface.co/" + desanitized_model_name
+
     # Remove user from model name
     # return (
     #     f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
@@ -47,40 +121,53 @@ def get_vidore_data():
 
     # local cache path
     model_infos_path = "model_infos.json"
+    metric = "ndcg_at_5"
+
+
     MODEL_INFOS = {}
     if os.path.exists(model_infos_path):
         with open(model_infos_path) as f:
             MODEL_INFOS = json.load(f)
 
     models = api.list_models(filter="vidore")
-
-    for model in models:
-        if model.modelId not in MODEL_INFOS:
-            readme_path = hf_hub_download(model.modelId, filename="README.md")
-            meta = metadata_load(readme_path)
-            try:
-                result_path = hf_hub_download(model.modelId, filename="results.json")
-
-                with open(result_path) as f:
-                    results = json.load(f)
-                # keep only ndcg_at_5
-                for dataset in results:
-                    results[dataset] = {key: value for key, value in results[dataset].items() if "ndcg_at_5" in key}
-
-                MODEL_INFOS[model.modelId] = {"metadata": meta, "results": results}
-            except:
-                continue
+    repositories = [model.modelId for model in models]
+
+    datasets_nickname = get_datasets_nickname()
+    for repo_id in repositories:
+        files = [f for f in api.list_repo_files(repo_id) if f.endswith('_metrics.json')]
+        if len(files) == 0:
+            continue
+        else:
+            for file in files:
+                model_name = file.split('_metrics.json')[0]
+
+                if model_name not in MODEL_INFOS:
+                    readme_path = hf_hub_download(repo_id, filename="README.md")
+                    meta = metadata_load(readme_path)
+                    try:
+                        result_path = hf_hub_download(repo_id, filename=file)
+
+                        with open(result_path) as f:
+                            results = json.load(f)
+                        # keep only ndcg_at_5
+                        for dataset in results:
+                            results[dataset] = {key: value for key, value in results[dataset].items() if metric in key}
+
+                        MODEL_INFOS[model_name] = {"meta": meta, "results": results}
+                    except:
+                        continue
 
     model_res = {}
     df = None
     if len(MODEL_INFOS) > 0:
         for model in MODEL_INFOS.keys():
+            print(model)
             res = MODEL_INFOS[model]["results"]
             dataset_res = {}
             for dataset in res.keys():
                 if "validation_set" == dataset:
                     continue
-                dataset_res[dataset] = res[dataset]["ndcg_at_5"]
+                dataset_res[datasets_nickname[dataset]] = res[dataset][metric]
             model_res[model] = dataset_res
 
         df = pd.DataFrame(model_res).T
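
For reference, a minimal sketch (not part of the diff above) of the loading pattern the updated get_vidore_data() relies on: scan a vidore-tagged repository for *_metrics.json files, download each one, and keep only the ndcg_at_5 entries per dataset. The repo id "vidore/some-model" is a hypothetical placeholder; the huggingface_hub calls (HfApi.list_repo_files, hf_hub_download) are the same ones used in app.py.

# Standalone sketch of the new per-file metrics loading (hypothetical repo id).
import json

from huggingface_hub import HfApi, hf_hub_download

api = HfApi()
repo_id = "vidore/some-model"  # placeholder: any repo tagged "vidore" that ships *_metrics.json files
metric = "ndcg_at_5"

# One "<model>_metrics.json" file per evaluated model, as get_vidore_data() expects.
metrics_files = [f for f in api.list_repo_files(repo_id) if f.endswith("_metrics.json")]

for file in metrics_files:
    model_name = file.split("_metrics.json")[0]
    result_path = hf_hub_download(repo_id, filename=file)
    with open(result_path) as f:
        results = json.load(f)
    # Same filtering as the leaderboard: skip the validation set, keep only ndcg_at_5.
    scores = {
        dataset: {k: v for k, v in metrics.items() if metric in k}
        for dataset, metrics in results.items()
        if dataset != "validation_set"
    }
    print(model_name, scores)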