import json
import sys

from datasets import load_dataset
from ppo_hh import create_reward_fn

import trlx
from trlx.data.default_configs import (
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    SFTConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)

# Supervised fine-tuning of GPT-J-6B on the "chosen" responses of the
# Anthropic Helpful & Harmless (HH) preference dataset.
default_config = TRLConfig(
    train=TrainConfig(
        seq_length=1024,
        epochs=100,
        total_steps=10000,
        batch_size=4,
        checkpoint_interval=10000,  # equal to total_steps: checkpoint only at the end
        eval_interval=1000,
        pipeline="PromptPipeline",
        trainer="AccelerateSFTTrainer",
        checkpoint_dir="checkpoints/sft_hh",
    ),
    # num_layers_unfrozen=-1 fine-tunes every layer of the model
    model=ModelConfig(model_path="EleutherAI/gpt-j-6B", num_layers_unfrozen=-1),
    # Left truncation keeps the most recent dialogue turns when a prompt exceeds seq_length
    tokenizer=TokenizerConfig(tokenizer_path="EleutherAI/gpt-j-6B", truncation_side="left"),
    optimizer=OptimizerConfig(
        name="adamw",
        kwargs=dict(lr=1e-6, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6),
    ),
    # T_max is far larger than total_steps, so the learning rate stays nearly constant
    scheduler=SchedulerConfig(name="cosine_annealing", kwargs=dict(T_max=100000000, eta_min=1e-6)),
    method=SFTConfig(
        name="sftconfig",
        gen_kwargs=dict(max_new_tokens=128, top_k=20, top_p=1.0, do_sample=True),
    ),
)


def preprocess(sample):
    # Concatenate the dialogue prompt with its preferred ("chosen") completion
    # to form a single supervised training sample.
    sample["chosen_sample"] = sample["prompt"] + sample["chosen"]
    return sample


def main(hparams={}):
    # Merge command-line overrides into the default config
    config = TRLConfig.update(default_config, hparams)

    dataset = load_dataset("Dahoas/full-hh-rlhf").map(preprocess)
    # Reward model from the companion PPO script, used here only as an evaluation metric
    reward_fn = create_reward_fn()

    trlx.train(
        config=config,
        samples=dataset["train"]["chosen_sample"],
        eval_prompts=dataset["test"]["prompt"][:280],
        metric_fn=lambda **kwargs: {"reward": reward_fn(**kwargs)},
        # Cut generations off as soon as the model starts a new dialogue turn
        stop_sequences=["Human:", "human:", "Assistant:", "assistant:"],
    )


if __name__ == "__main__":
    hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
    main(hparams)
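
# Example invocation (assumption: this file is saved as sft_hh.py alongside
# ppo_hh.py, as in the trlx HH example directory). Hyperparameter overrides
# are passed as a single JSON string in argv[1] and merged over default_config:
#
#   python sft_hh.py '{"train": {"total_steps": 200, "batch_size": 2}}'
#
# With no argument, the defaults above are used unchanged. For multi-GPU
# training, the same command can be run through `accelerate launch`.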