Spaces:
Running
Running
from typing import Dict | |
from huggingface_hub import get_collection | |
def get_datasets_nickname() -> Dict:
    """Map ViDoRe benchmark dataset ids to their display nicknames.

    Fetches the ``vidore/vidore-benchmark-667173f98e70a1c0fa4db00d``
    collection from the Hugging Face Hub. For every item whose id contains a
    known keyword, the nickname is registered under three keys: the id
    itself, the id with ``_ocr_chunk`` appended, and the id with
    ``_captioning`` appended. Items whose id matches no keyword are omitted.

    Returns:
        Dict: dataset id (and its two suffixed variants) -> nickname.
    """
    # Order matters: the first keyword found in the id wins.
    keyword_to_nickname = (
        ('arxivqa', 'ArxivQA'),
        ('docvqa', 'DocVQA'),
        ('infovqa', 'InfoVQA'),
        ('tabfquad', 'TabFQuad'),
        ('tatdqa', 'TATDQA'),
        ('shiftproject', 'ShiftProject'),
        ('artificial_intelligence', 'Artificial Intelligence'),
        ('energy', 'Energy'),
        ('government_reports', 'Government Reports'),
        ('healthcare', 'Healthcare'),
    )
    # '' keeps the bare dataset id as the first key written for each item.
    variant_suffixes = ('', '_ocr_chunk', '_captioning')

    nicknames = {}
    benchmark = get_collection("vidore/vidore-benchmark-667173f98e70a1c0fa4db00d")
    for entry in benchmark.items:
        item_id = entry.item_id
        label = next(
            (nick for keyword, nick in keyword_to_nickname if keyword in item_id),
            None,
        )
        if label is None:
            # Not one of the known benchmark datasets; skip it.
            continue
        for suffix in variant_suffixes:
            nicknames[item_id + suffix] = label
    return nicknames