# ILQL training on the CarperAI OpenAI TL;DR summarization comparison dataset.
import os

import torch
from datasets import load_dataset
from transformers import AutoTokenizer

import trlx
from trlx.data.default_configs import (
    ILQLConfig,
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)

from reward_model.reward_model import GPTRewardModel
# Default ILQL training configuration: fine-tunes a FLAN-T5-XL SFT checkpoint
# (seq2seq architecture) on offline comparison data. Any field may be
# overridden at runtime via the ``hparams`` JSON passed to ``main``.
default_config = TRLConfig(
    train=TrainConfig(
        seq_length=550,
        batch_size=8,
        epochs=100,
        total_steps=5000,
        # checkpoint_interval > total_steps: only the final checkpoint is kept.
        checkpoint_interval=10000,
        eval_interval=1000,
        pipeline="PromptPipeline",
        trainer="AccelerateILQLTrainer",
        checkpoint_dir="ilql_summarize_t5",
    ),
    # num_layers_unfrozen=-1 leaves every layer trainable.
    model=ModelConfig(model_path="pvduy/flant5-xl_openai_tldr_sft", num_layers_unfrozen=-1, model_arch_type="seq2seq"),
    # Left truncation keeps the end of long prompts (where " TL;DR:" sits).
    tokenizer=TokenizerConfig(tokenizer_path="pvduy/flant5-xl_openai_tldr_sft", truncation_side="left"),
    optimizer=OptimizerConfig(name="adamw", kwargs=dict(lr=1e-6, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6)),
    # T_max matches total_steps so the cosine schedule spans the whole run.
    scheduler=SchedulerConfig(name="cosine_annealing", kwargs=dict(T_max=5000, eta_min=1e-6)),
    method=ILQLConfig(
        name="ilqlconfig",
        tau=0.6,
        gamma=0.99,
        cql_scale=0.1,
        awac_scale=1,
        alpha=0.0001,
        beta=0,
        steps_for_target_q_sync=1,
        two_qs=True,
        # At generation time, sampling is repeated for each beta in the list.
        gen_kwargs=dict(max_new_tokens=50, top_k=50, beta=[1, 2, 3], temperature=1.0),
    ),
)
# Fetch the pretrained reward-model weights on first run; subsequent runs
# reuse the local copy.
REWARD_CHECKPOINT_PATH = "reward_model/rm_checkpoint/pytorch_model.bin"
if not os.path.exists(REWARD_CHECKPOINT_PATH):
    os.makedirs("reward_model/rm_checkpoint", exist_ok=True)
    # NOTE(review): shells out to wget; fine here since the URL/path are
    # constants, but subprocess.run([...]) would be the safer idiom.
    os.system(
        f"wget -O {REWARD_CHECKPOINT_PATH} \
        https://huggingface.co/CarperAI/openai_summarize_tldr_rm_checkpoint/resolve/main/pytorch_model.bin"
    )

# Base SFT model the reward model was built on.
SFT_MODEL_PATH = "CarperAI/openai_summarize_tldr_sft"
def main(hparams=None):
    """Train a T5 summarizer with ILQL on offline comparison data.

    Builds the trlx config, loads a GPT-J-based reward model (used only as
    an evaluation metric), converts the comparison dataset into
    (prompt, output) pairs with +1/-1 rewards, and launches training.

    Args:
        hparams: optional dict of overrides merged into ``default_config``.
    """
    # Avoid a mutable default argument; None means "no overrides".
    config = TRLConfig.update(default_config, hparams or {})

    rw_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    rw_tokenizer.pad_token = rw_tokenizer.eos_token
    rw_model = GPTRewardModel(SFT_MODEL_PATH)
    rw_model.load_state_dict(torch.load(REWARD_CHECKPOINT_PATH))
    rw_model.half()
    rw_model.eval()
    rw_device = torch.device("cuda:{}".format(1))  # set reward model device
    rw_model.to(rw_device)

    def reward_fn(samples):
        """Score generated summaries; returns one scalar per sample."""
        scores_list = []
        batch_size = 2  # small batches bound reward-model memory use
        for i in range(0, len(samples), batch_size):
            sub_samples = samples[i : i + batch_size]
            sub_samples = ["<|startoftext|>" + chosen + "<|endoftext|>" for chosen in sub_samples]
            encodings_dict = rw_tokenizer(
                sub_samples,
                truncation=True,
                max_length=config.train.seq_length,
                padding="max_length",
                return_tensors="pt",
            )
            input_ids = encodings_dict["input_ids"].to(rw_device)
            attn_masks = encodings_dict["attention_mask"].to(rw_device)
            # NOTE(review): inputs are duplicated along the batch dimension —
            # presumably GPTRewardModel expects chosen/rejected pairs; confirm
            # against reward_model/reward_model.py.
            input_ids = input_ids.repeat(2, 1)
            attn_masks = attn_masks.repeat(2, 1)
            with torch.no_grad():
                sub_scores = rw_model(input_ids=input_ids, attention_mask=attn_masks)
            scores_list.append(sub_scores["chosen_end_scores"])
        return torch.cat(scores_list, dim=0)

    def preprocess(sample):
        """Turn one comparison row into two (prompt, output) pairs.

        The first 7 characters ("TL;DR: ") are stripped from each summary
        because " TL;DR:" is appended to the prompt instead.
        """
        sample["prompt_output"] = [
            [sample["prompt"] + " TL;DR:", sample["chosen"][7:]],
            [sample["prompt"] + " TL;DR:", sample["rejected"][7:]],
        ]
        # ILQL reward: +1 for the human-preferred summary, -1 for the rejected one.
        sample["reward"] = [1, -1]
        return sample

    dataset = load_dataset("CarperAI/openai_summarize_comparisons")
    dataset = dataset.map(preprocess)
    # Flatten the per-row pair lists into parallel flat lists.
    prompts_outputs = sum(dataset["train"]["prompt_output"], [])
    rewards = sum(dataset["train"]["reward"], [])

    val_dataset = load_dataset("CarperAI/openai_summarize_tldr", split="valid")
    eval_prompts = list(val_dataset["prompt"])[:1000]

    trlx.train(
        dataset=(prompts_outputs, rewards),
        metric_fn=lambda samples, **kwargs: {"rewards": reward_fn(samples)},
        eval_prompts=eval_prompts,
        config=config,
    )
if __name__ == "__main__":
    import json
    import sys

    # Optional single CLI argument: a JSON object of config overrides.
    hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
    main(hparams)