"""Huggingface model coverter to FastTransformer. Reference: https://github.com/NVIDIA/FasterTransformer/tree/main/examples/pytorch/gptj/utils """ import configparser from argparse import ArgumentParser from os import makedirs from pathlib import Path import numpy as np import torch from transformers import PretrainedConfig torch.set_printoptions(linewidth=130, sci_mode=False) np.set_printoptions(linewidth=130, suppress=True) # This converter is used to convert the huggingface moyix/codegen-350M-mono-gptj model. def savebin(param, save_path): if isinstance(param, torch.Tensor): param = param.cpu().float().numpy() np.squeeze(param).astype(np.float32).tofile(save_path + ".bin") def param2file(pt_param, layer_id, save_dir, dest_key): base_n = save_dir + "/model.layers." + str(layer_id) + "." save_path = base_n + dest_key savebin(pt_param, save_path) def param2distributed( pt_param, layer_id, save_dir, dest_key, n_inference_gpus, split_axis, ): np_param = pt_param.cpu().float().numpy() base_n = save_dir + "/model.layers." + str(layer_id) + "." save_path = base_n + dest_key split_param = np.split(np_param, n_inference_gpus, axis=split_axis) for i, p in enumerate(split_param): savebin(p, save_path + f".{i}") def save(w, save_dir, n_inference_gpus, n_layers, layer_id): makedirs(save_dir, exist_ok=True) savebin(w["transformer.wte.weight"], save_dir + "/model.wte") l = layer_id print(f"Saving layer {l + 1} / {n_layers}") base_k = "transformer.h." + str(l) + "." param2file(w[base_k + "ln_1.bias"], l, save_dir, "input_layernorm.bias") param2file(w[base_k + "ln_1.weight"], l, save_dir, "input_layernorm.weight") param2distributed( w[base_k + "mlp.fc_in.weight"].T, l, save_dir, "mlp.dense_h_to_4h.weight", n_inference_gpus, split_axis=-1, # split fast indx ) param2distributed( w[base_k + "mlp.fc_in.bias"], l, save_dir, "mlp.dense_h_to_4h.bias", n_inference_gpus, split_axis=-1, # split fast indx ) param2distributed( w[base_k + "mlp.fc_out.weight"].T, l, save_dir, "mlp.dense_4h_to_h.weight", n_inference_gpus, split_axis=0, # split slow indx ) param2file(w[base_k + "mlp.fc_out.bias"], l, save_dir, "mlp.dense_4h_to_h.bias") param2distributed( w[base_k + "attn.out_proj.weight"].T, l, save_dir, "attention.dense.weight", n_inference_gpus, split_axis=0, # split slow indx ) QKV_w = torch.stack( [ w[base_k + "attn.q_proj.weight"], w[base_k + "attn.k_proj.weight"], w[base_k + "attn.v_proj.weight"], ] ) # [qkv, n_heads * dim_head, latent_space] QKV_w = QKV_w.permute(2, 0, 1) param2distributed( QKV_w, l, save_dir, "attention.query_key_value.weight", n_inference_gpus, split_axis=-1, # split fast indx ) # Other unneeded per-layer params: # attn.attention.masked_bias = torch.tensor(-1e9) # attn.attention.bias = torch.tril(torch.ones(1, 1, 2048, 2048)) if __name__ == "__main__": parser = ArgumentParser( description="Convert GPT-J slim checkpoint to FasterTransformer", ) parser.add_argument( "--output-dir", help="Folder where binary files are stored", default="c-models/", ) parser.add_argument( "--ckpt-dir", help="File of GPT-J huggingface checkpoint", default="./" ) parser.add_argument( "--n-inference-gpus", help="Number of GPUs used for inference runtime", default=1, type=int, ) parser.add_argument( "--n-layers", help="Number of GPT-J decoder layer", default=20, type=int ) args = parser.parse_args() ckpt_file = args.ckpt_dir + "/pytorch_model.bin" checkpoint = torch.load(ckpt_file) print(f"loading from {ckpt_file}") out_path = args.output_dir output_dir = out_path + f"/{args.n_inference_gpus}-gpu/" print(f"saving to {output_dir}") config_file = args.ckpt_dir + "/config.json" hf_config = PretrainedConfig.from_json_file(config_file).to_dict() # NOTE: save parameters to config files (loaded by triton backends) config = configparser.ConfigParser() config["gptj"] = {} try: config["gptj"]["model_name"] = ( "gptj" if hf_config["_name_or_path"] == "" else hf_config["_name_or_path"] ) config["gptj"]["head_num"] = str(hf_config["n_head"]) n_embd = hf_config["n_embd"] config["gptj"]["size_per_head"] = str(n_embd // hf_config["n_head"]) config["gptj"]["inter_size"] = str(n_embd * 4) config["gptj"]["num_layer"] = str(hf_config["n_layer"]) rotary_dim = ( n_embd // hf_config["n_head"] if hf_config["rotary_dim"] is None else hf_config["rotary_dim"] ) config["gptj"]["rotary_embedding"] = str(hf_config["rotary_dim"]) config["gptj"]["vocab_size"] = str(hf_config["vocab_size"]) config["gptj"]["start_id"] = str(hf_config["bos_token_id"]) config["gptj"]["end_id"] = str(hf_config["eos_token_id"]) config["gptj"]["weight_data_type"] = "fp32" Path(output_dir).mkdir(exist_ok=True, parents=True) with open(output_dir + "/config.ini", "w") as configfile: config.write(configfile) except: print(f"Fail to save the config in config.ini.") for i in range(args.n_layers): save(checkpoint, output_dir, args.n_inference_gpus, args.n_layers, i) savebin( checkpoint["transformer.ln_f.weight"], output_dir + "/model.final_layernorm.weight", ) savebin( checkpoint["transformer.ln_f.bias"], output_dir + "/model.final_layernorm.bias" ) savebin(checkpoint["lm_head.weight"], output_dir + "/model.lm_head.weight") savebin(checkpoint["lm_head.bias"], output_dir + "/model.lm_head.bias") print("done")