import os
import random
from typing import Dict, Optional, Sequence, List

from llava.datasets.builder import DATASETS
from llava.datasets.data_cfgs import data_configs
from llava.datasets.base_dataset import ImageTaskDataset
from llava.datasets.prompts import cc_sbu_prompt
from llava.constants import DEFAULT_IMAGE_TOKEN


class CCSBUDataset(ImageTaskDataset):
    """Image-caption dataset that renders each item as a two-turn conversation.

    Loading and item handling are delegated to ``ImageTaskDataset``; this
    subclass only supplies the default dataset name and the text formatting.
    """

    def __init__(self, anno_path, data_args=None, name='cc_sbu'):
        """Forward the annotation path, data arguments and name to the base class.

        Args:
            anno_path: Annotation location, passed through unchanged.
            data_args: Optional data arguments, passed through unchanged.
            name: Dataset name handed to the base class.
        """
        super().__init__(anno_path=anno_path, data_args=data_args, name=name)

    def text_preprocess(self, item) -> List[Dict[str, str]]:
        """Build the human/model conversation for a single annotation item.

        Args:
            item: Mapping that provides the caption under the ``'caption'`` key.

        Returns:
            A two-element list: a human turn made of the image token followed
            by a randomly chosen prompt, then a model turn holding the caption.
        """
        # Read the caption first so a missing key fails before any random
        # state is consumed.
        answer = item['caption']
        question = DEFAULT_IMAGE_TOKEN + random.choice(cc_sbu_prompt)

        human_turn = {'from': 'human', 'value': question}
        model_turn = {'from': 'model', 'value': answer}
        return [human_turn, model_turn]


@DATASETS.register_obj
def cc_sbu(data_args):
    """Create the CC-SBU dataset from its configured training annotation path.

    Args:
        data_args: Data arguments forwarded to ``CCSBUDataset``.

    Returns:
        A ``CCSBUDataset`` built from ``data_configs["cc_sbu"]['train_data_path']``.
    """
    annotation_path = data_configs["cc_sbu"]['train_data_path']
    return CCSBUDataset(annotation_path, data_args)