from typing import Optional, Union, Callable, Iterator
from collections.abc import Collection
from functools import partial

from datasets import load_dataset
from litdata import optimize, StreamingDataset, TokensLoader
from litgpt.tokenizer import Tokenizer

from cognition_dataset import self_cognition_messages


def batch_dict_iterator(path: Optional[str]=None,
                        name: Optional[str]=None,
                        data: Optional[Collection]=None,
                        data_dir: Optional[str]=None,
                        data_files: Optional[str]=None,
                        keep_in_memory: bool=False,
                        revision: Optional[str]=None,
                        split: str='train',
                        num_proc: Optional[int]=None,
                        field: Optional[str]=None,
                        transform: Optional[Callable]=None) -> Iterator[str]:
    # either a Hugging Face dataset path or pre-loaded data must be provided
    assert path is not None or data is not None
    if path and not data:
        data = load_dataset(path=path,
                            name=name,
                            data_dir=data_dir,
                            data_files=data_files,
                            keep_in_memory=keep_in_memory,
                            revision=revision,
                            split=split,
                            trust_remote_code=True,
                            num_proc=num_proc)

    if data and field:
        data = data[field]

    if transform:
        data = [transform(n) for n in data]

    for n in data:
        # render each message with ChatML-style markers, then join the turns
        parts: list[str] = []

        for m in n:
            parts.append(f'<im_start>{m["role"]}\n{m["content"]}<im_end>')

        yield '\n'.join(parts)
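
# Illustrative only: each yielded sample is one conversation flattened into
# ChatML-style text with the markers used above; contents below are made up:
#
#   <im_start>user
#   What is 2 + 2?<im_end>
#   <im_start>assistant
#   4<im_end>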


def batch_iterator(dataset_config: Union[list, dict]):
    # a dict is a single dataset config; a list is a group of configs
    if isinstance(dataset_config, dict):
        for text in batch_dict_iterator(**dataset_config):
            yield text
    elif isinstance(dataset_config, list):
        for dc in dataset_config:
            for text in batch_dict_iterator(**dc):
                yield text
    else:
        raise ValueError(f'dataset_config must be a dict or a list of dicts, got {type(dataset_config)!r}')
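
# Illustrative only: a single config dict streams one dataset, a list streams
# each config in turn (the dataset names below are placeholders):
#
#   batch_iterator({'path': 'org/dataset-a', 'split': 'train'})
#   batch_iterator([{'path': 'org/dataset-a'}, {'path': 'org/dataset-b'}])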


def tokenize_fn(dataset_config: Union[dict, list], tokenizer: Optional[Tokenizer]=None):
    assert isinstance(dataset_config, (dict, list))
    assert tokenizer is not None, 'a litgpt Tokenizer is required'

    for text in batch_iterator(dataset_config):
        text_ids = tokenizer.encode(text, bos=False, eos=True)
        yield text_ids
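
# A minimal sketch of what tokenize_fn yields, assuming a litgpt checkpoint
# directory with tokenizer files (the path is hypothetical). bos=False/eos=True
# appends the end-of-sequence token to every conversation, which lets
# TokensLoader pack the samples back-to-back into fixed-size blocks later:
#
#   tok = Tokenizer('/path/to/litgpt/checkpoint')  # hypothetical path
#   config = {'data': [[{'role': 'user', 'content': 'hi'}]]}
#   ids = next(tokenize_fn(config, tokenizer=tok))  # 1-D tensor of token ids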


# normalize the role labels used by the various datasets to ChatML roles
roles_map = {
    'system': 'system',
    'user': 'user',
    'human': 'user',
    'assistant': 'assistant',
    'gpt': 'assistant',
    'AI': 'assistant',
}


datasets_configs = [
    # self-cognition identity conversations
    {'path': None, 'field': None, 'data': self_cognition_messages, 'transform': lambda r: [
        {'role': 'user', 'content': r['instruction']},
        {'role': 'assistant', 'content': r['output']},
    ]},

    # instruction-following conversations, loaded in five 20% shards
    [
        {'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
            {'role': roles_map[m['from']], 'content': m['value']}
            for m in msgs
        ]}
        for i in range(0, 100, 20)
    ],

    # multilingual instruction/input/output triples, in 20% shards
    [
        {'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 20}%]', 'transform': lambda r: [
            {'role': 'system', 'content': r['instruction']},
            {'role': 'user', 'content': r['input']},
            {'role': 'assistant', 'content': r['output']},
        ]}
        for i in range(0, 100, 20)
    ],

    # math conversations, in 10% shards; the 'messages' field already holds
    # role/content dicts, so no transform is needed
    [
        {'path': 'ai2-adapt-dev/openmath-2-math', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
        for i in range(0, 100, 10)
    ],

    # function-calling conversations
    {'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
        {'role': roles_map[m['from']], 'content': m['value']}
        for m in msgs
    ]},

    # agent conversations, in 20% shards
    [
        {'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 20}%]', 'field': 'conversations', 'transform': lambda msgs: [
            {'role': roles_map[m['from']], 'content': m['value']}
            for m in msgs
        ]}
        for i in range(0, 100, 20)
    ],

    # step-by-step reasoning
    [
        {'path': 'AtlasUnified/Atlas-Reasoning', 'data_files': 'reasoning.csv', 'transform': lambda r: [
            {'role': 'user', 'content': r['Prompt']},
            {'role': 'assistant', 'content': r['Step-by-step reasoning'] + '\n' + r['Solution']},
        ]},
    ],

    # math reasoning
    [
        {'path': 'thesven/gsm8k-reasoning', 'transform': lambda r: [
            {'role': 'user', 'content': r['question']},
            {'role': 'assistant', 'content': r['generation'] + '\n' + r['answer'] + '\n' + r['short_answer']},
        ]},

        {'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track', 'transform': lambda r: [
            {'role': 'user', 'content': r['informal_statement']},
            {'role': 'assistant', 'content': r['informal_proof'] + '\n' + r['formal_proof']},
        ]},

        {'path': 'KingNish/reasoning-base-20k', 'transform': lambda r: [
            {'role': 'user', 'content': r['user']},
            {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['assistant']},
        ]},

        {'path': 'Aarushhh/math-reasoning-10k', 'transform': lambda r: [
            {'role': 'user', 'content': r['problem']},
            {'role': 'assistant', 'content': r['plan'] + '\n' + r['solution']},
        ]},
    ],

    # general reasoning
    [
        {'path': 'SkunkworksAI/reasoning-0.01', 'transform': lambda r: [
            {'role': 'user', 'content': r['instruction']},
            {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['output']},
        ]},

        {'path': 'Magpie-Align/Magpie-Reasoning-150K', 'transform': lambda r: [
            {'role': 'user', 'content': r['instruction']},
            {'role': 'assistant', 'content': r['response']},
        ]},
    ],

    # reflection-style responses
    [
        {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': lambda r: [
            {'role': 'system', 'content': r['system']},
            {'role': 'user', 'content': r['prompt']},
            {'role': 'assistant', 'content': r['response']},
        ]},

        {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': lambda r: [
            {'role': 'system', 'content': r['system']},
            {'role': 'user', 'content': r['prompt']},
            {'role': 'assistant', 'content': r['response']},
        ]},

        {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': lambda r: [
            {'role': 'system', 'content': r['system']},
            {'role': 'user', 'content': r['prompt']},
            {'role': 'assistant', 'content': r['response']},
        ]},

        {'path': 'flozi00/reflection-qwen2.5-72b-260924', 'transform': lambda r: [
            r['system'][0],
            {'role': 'user', 'content': r['input']},
            {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
        ]},

        {'path': 'gretelai/synthetic-gsm8k-reflection-405b', 'split': 'train+test', 'transform': lambda r: [
            {'role': 'user', 'content': r['question']},
            {'role': 'assistant', 'content': r['answer_with_tags']},
        ]},
    ],
]
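
# Each top-level element above (a single dict or a list of dicts) is one input
# item for optimize() below, so it becomes one unit of work for a worker
# process; splitting large datasets into percentage shards keeps those units
# roughly balanced across the workers.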


if __name__ == '__main__':
    # guard so litdata's worker processes can re-import this module safely
    optimize(
        fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
        inputs=datasets_configs,
        output_dir='../contrain-data/',
        # chunk size is measured in tokens: 16000 blocks of 1024 tokens each
        chunk_size=(1024 * 16000),
        num_workers=32,
        # note: depending on the litdata version, optimize() may also need
        # item_loader=TokensLoader() so the output can be read back below
    )

    # sanity check: stream the optimized dataset back and print how many
    # 1024-token blocks it contains
    dataset = StreamingDataset(
        input_dir='../contrain-data/',
        item_loader=TokensLoader(block_size=1024),
    )

    print(len(dataset))
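
# A minimal sketch of consuming the optimized data during training, assuming
# litdata's StreamingDataLoader (the batch size is illustrative):
#
#   from litdata import StreamingDataLoader
#
#   dataloader = StreamingDataLoader(dataset, batch_size=8)
#   for batch in dataloader:
#       ...  # batch is a (8, 1024) tensor of token ids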