Spaces:
Runtime error
Runtime error
File size: 2,413 Bytes
476ac07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# Copyright (c) OpenMMLab. All rights reserved.
from functools import partial
from torch.utils.data import ConcatDataset
from xtuner.dataset import MOSSSFTDataset
from xtuner.dataset.collate_fns import default_collate_fn
def moss_003_sft_dataset(tokenizer,
                         plugins_data_file=None,
                         no_plugins_data_file=None,
                         bot_name=None,
                         max_length=2048):
    """Build the combined MOSS-003 SFT dataset.

    The plugins dataset and the no-plugins dataset are each built by their
    own factory function and then joined, plugins first, into a single
    ``ConcatDataset``.

    Args:
        tokenizer: Forwarded unchanged to both dataset factories.
        plugins_data_file: Data file for the plugins split. ``None`` lets
            ``moss_003_sft_plugins_dataset`` pick its default path.
        no_plugins_data_file: Data file for the no-plugins split. ``None``
            lets ``moss_003_sft_no_plugins_dataset`` pick its default path.
        bot_name: Forwarded unchanged to both dataset factories.
        max_length: Forwarded unchanged to both dataset factories.

    Returns:
        A ``ConcatDataset`` over ``[plugins, no_plugins]``.
    """
    # Options that are identical for both splits.
    shared_kwargs = dict(bot_name=bot_name, max_length=max_length)

    with_plugins = moss_003_sft_plugins_dataset(
        tokenizer, data_file=plugins_data_file, **shared_kwargs)
    without_plugins = moss_003_sft_no_plugins_dataset(
        tokenizer, data_file=no_plugins_data_file, **shared_kwargs)

    return ConcatDataset([with_plugins, without_plugins])
def moss_003_sft_data_collator(return_hf_format=False):
    """Return ``default_collate_fn`` with ``return_hf_format`` pre-bound.

    Args:
        return_hf_format: Value bound to the ``return_hf_format`` keyword
            of ``default_collate_fn``.

    Returns:
        A ``functools.partial`` wrapping ``default_collate_fn``.
    """
    collate_kwargs = {'return_hf_format': return_hf_format}
    return partial(default_collate_fn, **collate_kwargs)
def moss_003_sft_no_plugins_dataset(tokenizer,
                                    data_file=None,
                                    bot_name=None,
                                    max_length=2048):
    """Build the MOSS-003 SFT dataset without plugins.

    Args:
        tokenizer: Passed to ``MOSSSFTDataset`` as ``tokenizer``.
        data_file: Path of the data file. When ``None``, the default
            relative path ``./data/moss-003-sft-no-tools.jsonl`` is used.
        bot_name: Passed to ``MOSSSFTDataset`` as ``bot_name``.
        max_length: Passed to ``MOSSSFTDataset`` as ``max_length``.

    Returns:
        The constructed ``MOSSSFTDataset`` instance.
    """
    # Download data from https://huggingface.co/datasets/fnlp/moss-003-sft-data
    default_path = './data/moss-003-sft-no-tools.jsonl'
    source_path = default_path if data_file is None else data_file
    return MOSSSFTDataset(
        data_file=source_path,
        bot_name=bot_name,
        tokenizer=tokenizer,
        max_length=max_length)
def moss_003_sft_no_plugins_data_collator(return_hf_format=False):
    """Return ``default_collate_fn`` with ``return_hf_format`` pre-bound.

    Args:
        return_hf_format: Value bound to the ``return_hf_format`` keyword
            of ``default_collate_fn``.

    Returns:
        A ``functools.partial`` wrapping ``default_collate_fn``.
    """
    collate_kwargs = {'return_hf_format': return_hf_format}
    return partial(default_collate_fn, **collate_kwargs)
def moss_003_sft_plugins_dataset(tokenizer,
                                 data_file=None,
                                 bot_name=None,
                                 max_length=2048):
    """Build the MOSS-003 SFT dataset with plugins.

    Args:
        tokenizer: Passed to ``MOSSSFTDataset`` as ``tokenizer``.
        data_file: Path of the data file. When ``None``, a default relative
            path under ``./data/`` is used.
        bot_name: Passed to ``MOSSSFTDataset`` as ``bot_name``.
        max_length: Passed to ``MOSSSFTDataset`` as ``max_length``.

    Returns:
        The constructed ``MOSSSFTDataset`` instance.
    """
    # Download data from https://huggingface.co/datasets/fnlp/moss-003-sft-data
    default_path = './data/conversations_with_tools_with_inner_instruction_no_text2image_train_all_random_meta0.5_0.1_0.01_moss_0709.jsonl'  # noqa: E501
    source_path = default_path if data_file is None else data_file
    return MOSSSFTDataset(
        data_file=source_path,
        bot_name=bot_name,
        tokenizer=tokenizer,
        max_length=max_length)
def moss_003_sft_plugins_data_collator(return_hf_format=False):
    """Return ``default_collate_fn`` with ``return_hf_format`` pre-bound.

    Args:
        return_hf_format: Value bound to the ``return_hf_format`` keyword
            of ``default_collate_fn``.

    Returns:
        A ``functools.partial`` wrapping ``default_collate_fn``.
    """
    collate_kwargs = {'return_hf_format': return_hf_format}
    return partial(default_collate_fn, **collate_kwargs)
|