|
|
|
|
|
|
|
|
|
|
|
import os
import sys
from pprint import pprint

import torch
from tqdm import tqdm

# Make the local package importable; adjust this path to your checkout.
sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')

try:
    import iupac_gpt as gpt
except ImportError:
    # Fall back to the repository root when running from a subdirectory.
    sys.path.extend([".."])
    import iupac_gpt as gpt
|
hyperparams = {"batch_size": 64, "max_epochs": 10, "max_length": 1280,
               "learning_rate": 5e-4, "weight_decay": 0.0,
               "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
               "scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
               "vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
               "n_layer": 8, "n_head": 8, "n_embd": 256}
|
|
|
gpus = [0]
num_workers = 24

# Run on CUDA when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
train_dataloader, iupac_tokenizer = gpt.get_data_loader(
    is_train=1, dataset_filename='./pubchem_iupac_smile_gpt.csv')
pbar = tqdm(train_dataloader)
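# Each batch from this loader is assumed to be a dict of tensors including
# "input_ids" and "labels" (and presumably an attention mask), which is what
# model(**batch) consumes further below.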
|
|
|
|
|
# Optional: iterate over the loader once to inspect batch shapes.
'''
for inputs in pbar:
    src_label = inputs["labels"].to(device)
    inputs = prepare_input(inputs, device)
    src = inputs["input_ids"].to(device)
    # self.tokenizer._convert_token_to_id
    print(src.shape, src_label)
'''
|
tokenizer = iupac_tokenizer |
|
|
|
|
|
iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
iupac_encoded = tokenizer(iupac_string)
# Prepend the start-of-sequence id (2 in this vocabulary), which the GPT-2
# config below also uses as BOS.
iupac_encoded['input_ids'] = [2] + iupac_encoded['input_ids']

iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]
|
|
|
|
|
print(iupac_encoded['input_ids'])
print(iupac_merges)

print(tokenizer.unk_token_id, tokenizer.eos_token_id, tokenizer.unk_token,
      tokenizer.eos_token, tokenizer.vocab_size)
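# Quick round-trip check (a sketch): decode the full id sequence back into a
# single string. Depending on how the iupac_gpt tokenizer joins sub-tokens the
# result may contain extra separators, so it is printed rather than asserted.
print(tokenizer.decode(iupac_encoded['input_ids'], skip_special_tokens=True))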
|
|
|
# Grab a single batch to exercise the freshly initialised model below.
batch = next(iter(pbar))
|
from transformers import GPT2Config, GPT2LMHeadModel |
|
|
|
config = GPT2Config(vocab_size=tokenizer.vocab_size,
                    bos_token_id=tokenizer.unk_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    n_layer=hyperparams["n_layer"],
                    n_head=hyperparams["n_head"],
                    n_embd=hyperparams["n_embd"],
                    n_positions=hyperparams["max_length"],
                    n_ctx=hyperparams["max_length"])
|
model = GPT2LMHeadModel(config) |
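# Rough size check of the freshly initialised model (not required for training).
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"GPT-2 parameters: {n_params / 1e6:.1f}M")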
|
|
|
|
|
|
|
outputs = model(**batch) |
|
print(outputs.keys()) |
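# The training collator is assumed to include "labels" in the batch, in which
# case the forward pass also returns a language-modelling loss; exp(loss) is
# the per-token perplexity of the untrained model (a sketch, not required).
loss = getattr(outputs, "loss", None)
if loss is not None:
    print("loss:", loss.item(), "ppl:", torch.exp(loss).item())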
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from pytorch_lightning import Trainer |
|
from pytorch_lightning.callbacks.early_stopping import EarlyStopping |
|
|
|
checkpoint = "./checkpoints/iupac" |
|
|
|
|
|
'''
trainer = Trainer(
    gpus=gpus,
    max_epochs=hyperparams["max_epochs"],
    callbacks=[EarlyStopping("ppl", 0.1, 3)],  # [EarlyStopping("ppl", 0.2, 2)]
    auto_lr_find=False,           # Set to True to search for an optimal learning rate.
    auto_scale_batch_size=False,  # Set to True to scale the batch size automatically.
    accelerator="gpu",  # devices=4,
    strategy="ddp"
)
lit_model = gpt.GPT2LitModel(
    model,
    batch_size=hyperparams["batch_size"],
    learning_rate=hyperparams["learning_rate"],
    final_learning_rate=hyperparams["final_learning_rate"],
    weight_decay=hyperparams["weight_decay"],
    adam_eps=hyperparams["adam_eps"],
    adam_betas=hyperparams["adam_betas"],
    scheduler_T_max=hyperparams["scheduler_T_max"],
    save_model_every=1, checkpoint=checkpoint)
trainer.fit(lit_model, train_dataloader)

# model.module.save_pretrained('./pretrained')
model.save_pretrained('./pretrained')
'''
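# Note: the visualisation and sampling cells below reload weights with
# GPT2LMHeadModel.from_pretrained(checkpoint), which assumes the Lightning
# save_model_every callback wrote Hugging Face-format files (config.json plus
# model weights) into ./checkpoints/iupac; if only ./pretrained exists, point
# `checkpoint` there instead.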
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from bertviz import head_view

# Reload the trained model with attention outputs enabled and run the encoded
# example through it; the last element of the output holds the per-layer
# attention weights.
input_ids_list = iupac_encoded['input_ids']
model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
attention = model(torch.LongTensor([input_ids_list]))[-1]
tokens = [tokenizer.decode(i) for i in input_ids_list]
print(input_ids_list, attention, tokens)
|
|
|
head_view(attention, tokens) |
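# head_view (above) and model_view (below) render interactive HTML via
# IPython.display, so they are meant to be run inside a Jupyter notebook; in a
# plain script they will not display anything.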
|
|
|
|
|
|
|
from bertviz import model_view |
|
|
|
|
|
model_view(attention, tokens) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
model.eval()

generated_smiles_list = []
n_generated = 50000

for _ in tqdm(range(n_generated)):
    # Seed each sample with the start token (unk doubles as BOS, see the config above).
    smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])

    generated_ids = model.generate(smiles_start,
                                   max_length=hyperparams["max_length"],
                                   do_sample=True, top_p=hyperparams["top_p"],
                                   repetition_penalty=1.2,
                                   pad_token_id=tokenizer.eos_token_id)

    generated_smiles = tokenizer.decode(generated_ids[0],
                                        skip_special_tokens=True)
    generated_smiles_list.append(generated_smiles)
|
|
|
print(generated_smiles_list[:10]) |
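# Sampling one sequence per call is slow for 50,000 names; a batched sketch
# using generate()'s num_return_sequences argument (same sampling settings,
# hypothetical batch size of 64) would look like this:
#
# gen_batch = 64
# batched_names = []
# for _ in tqdm(range(n_generated // gen_batch)):
#     ids = model.generate(smiles_start,
#                          max_length=hyperparams["max_length"],
#                          do_sample=True, top_p=hyperparams["top_p"],
#                          repetition_penalty=1.2,
#                          num_return_sequences=gen_batch,
#                          pad_token_id=tokenizer.eos_token_id)
#     batched_names.extend(tokenizer.decode(seq, skip_special_tokens=True)
#                          for seq in ids)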
|
|
|
|
|
import numpy as np
import pandas as pd

df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])

df2.to_csv("iupacGPT2-gen50K.csv", index=False, sep="|")
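# Optional post-processing sketch: drop empty and duplicate names before
# downstream use (the 'iupac' column name matches the DataFrame above).
unique_names = df2[df2['iupac'].str.len() > 0].drop_duplicates('iupac')
print(len(unique_names), "unique generated IUPAC names out of", len(df2))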
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|