vidore-leaderboard / data /dataset_handler.py
Hugues Sibille
refactor : break app.py in different files
187990b
raw
history blame
2.89 kB
from typing import Dict
from huggingface_hub import get_collection
def get_datasets_nickname() -> Dict:
    """Build a lookup from dataset identifiers to short display nicknames.

    The items of the ``vidore/vidore-benchmark-667173f98e70a1c0fa4db00d``
    collection are obtained through ``huggingface_hub.get_collection``. For
    every item whose ``item_id`` contains one of the known keywords, three
    keys are registered with the same nickname: the ``item_id`` itself, the
    ``item_id`` followed by ``_ocr_chunk`` and the ``item_id`` followed by
    ``_captioning``.

    Items whose ``item_id`` contains none of the keywords are left out.

    Returns:
        A dict mapping each registered identifier to its nickname.
    """
    # Order matters: the first keyword found in the identifier decides the
    # nickname, and later keywords are not considered for that item.
    keyword_labels = (
        ('arxivqa', 'ArxivQA'),
        ('docvqa', 'DocVQA'),
        ('infovqa', 'InfoVQA'),
        ('tabfquad', 'TabFQuad'),
        ('tatdqa', 'TATDQA'),
        ('shiftproject', 'ShiftProject'),
        ('artificial_intelligence', 'Artificial Intelligence'),
        ('energy', 'Energy'),
        ('government_reports', 'Government Reports'),
        ('healthcare', 'Healthcare'),
    )
    # The empty suffix registers the bare identifier first, followed by its
    # two derived variants.
    variant_suffixes = ('', '_ocr_chunk', '_captioning')

    nickname_by_id = {}
    benchmark = get_collection("vidore/vidore-benchmark-667173f98e70a1c0fa4db00d")
    for entry in benchmark.items:
        identifier = entry.item_id
        label = next(
            (nick for keyword, nick in keyword_labels if keyword in identifier),
            None,
        )
        if label is None:
            # No known keyword in this identifier: nothing to register.
            continue
        for suffix in variant_suffixes:
            nickname_by_id[identifier + suffix] = label
    return nickname_by_id