# NOTE(review): removed non-Python scrape artifacts that preceded the module
# source (a "File size" banner, a commit hash, and a line-number gutter 1-30).
# No program content was lost; the actual module begins below.
# from .processors.builder import build_processors
from .xgpt3_dataset import MultiModalDataset
from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor

def train_valid_test_datasets_provider(data_path, config, tokenizer, seq_length=1024, loss_objective = 'sequential'):
    """Build the train and validation datasets for mPLUG-Owl.

    Thin wrapper around ``build_train_valid_test_datasets`` that forwards the
    tokenizer and config, mapping ``seq_length`` onto ``max_length``.
    """
    print('> building train and validation datasets for mPLUG-Owl ...')
    datasets = build_train_valid_test_datasets(
        input_file=data_path,
        tokenizer=tokenizer,
        max_length=seq_length,
        config=config,
        loss_objective=loss_objective,
    )
    print("> finished creating mPLUG-Owl datasets ...")
    # build_train_valid_test_datasets returns a (train_ds, valid_ds) pair.
    return datasets


def build_train_valid_test_datasets(input_file, tokenizer, max_length=80, config=None, loss_objective='sequential'):
    """Construct the train and validation ``MultiModalDataset`` pair.

    Args:
        input_file: sequence of exactly two paths, ``[train_file, valid_file]``.
        tokenizer: tokenizer shared by both datasets and the processor.
        max_length: maximum token sequence length per sample.
        config: dict providing at least ``'pretrained_ckpt'`` — the checkpoint
            path used to load the image processor.
        loss_objective: loss objective forwarded to each ``MultiModalDataset``.
            Previously this name was referenced in the body without being a
            parameter (a ``NameError``), and callers passing it as a keyword
            got a ``TypeError``; it is now an explicit parameter whose default
            matches the provider function's default.

    Returns:
        Tuple ``(train_ds, valid_ds)``.
    """
    # train_processors = build_processors(config['train_processors'])
    # valid_processors = build_processors(config['valid_processors'])

    image_processor = MplugOwlImageProcessor.from_pretrained(config['pretrained_ckpt'])
    processor = MplugOwlProcessor(image_processor, tokenizer)

    # Exactly one train file and one validation file are expected; if you have
    # more than 2 files, modify the code here or merge them into train and dev.
    assert len(input_file) == 2
    train_ds = MultiModalDataset(input_file[0], tokenizer, processor, max_length, loss_objective=loss_objective)
    valid_ds = MultiModalDataset(input_file[1], tokenizer, processor, max_length, loss_objective=loss_objective)
    return (train_ds, valid_ds)