#!/usr/bin/env python
# coding: utf-8

# # Generative Pre-Training from Molecules

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"  # Uncomment to restrict the visible GPUs.
from pprint import pprint
import sys
sys.path.append('/root/autodl-tmp/wjm/iupac-gpt')  # Local path to the iupac-gpt repository.
from tqdm import tqdm
try:
    import iupac_gpt as gpt
except ImportError:
    sys.path.extend([".."])  # The parent directory stores the `iupac_gpt` package.
    import iupac_gpt as gpt
import torch

# For demonstration purposes, we use only a 10K subset of the PubChem data made available by the
# [ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
# on the first 5M compounds with the following hyperparameters:
# ```python
# hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
#                "learning_rate": 5e-4, "weight_decay": 0.0,
#                "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
#                "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
#                "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
#                "n_layer": 4, "n_head": 8, "n_embd": 512}
# ```
# Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.
hyperparams = {"batch_size": 128, "max_epochs": 10, "max_length": 1280,
               "learning_rate": 5e-4, "weight_decay": 0.0,
               "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
               "scheduler_T_max": 1_000, "final_learning_rate": 5e-8,
               "vocab_size": 1491, "min_frequency": 2, "top_p": 0.96,
               "n_layer": 8, "n_head": 8, "n_embd": 256}

gpus = [0]  # Specify either a list of GPU devices or an integer (0 for no GPU).
num_workers = 16  # Number of dataloader worker processes.
# ## Tokenization
# 
# `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
# byte-pair encoding, and augments the resulting token list with the `"<s>"` (beginning-of-string)
# and `"</s>"` (end-of-string) special tokens; `smiles_gpt.SMILESAlphabet` stores 72 possible
# characters as the initial vocabulary. Here, the analogous IUPAC tokenizer is returned by
# `gpt.get_data_loader` together with the training dataloader.
device = 'cuda'  # Target device for the optional inspection loop below.
train_dataloader, iupac_tokenizer = gpt.get_data_loader(
    is_train=1, dataset_filename='./pubchem_iupac_smile_gpt.csv')
pbar = tqdm(train_dataloader)


# Optional inspection loop over the training batches (kept disabled for reference):
# for inputs in pbar:
#     src_label = inputs["labels"].to(device)
#     inputs = prepare_input(inputs, device)
#     src = inputs["input_ids"].to(device)
#     print(src.shape, src_label)
tokenizer = iupac_tokenizer
# Special tokens: "<unk>" (id 2) is the start mark, "</s>" (id 1) the end mark, and "<pad>" (id 0) the padding token.

iupac_string = "2-amino-9-[4-hydroxy-3-(hydroxymethyl)-2-methylidenecyclopentyl]-1H-purin-6-one"
iupac_encoded = tokenizer(iupac_string)
iupac_encoded['input_ids'] = [tokenizer.unk_token_id] + iupac_encoded['input_ids']  # Prepend the start mark (id 2).

iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]  # Token string for each id.
#iupac_encoded['attention_mask']

print(iupac_encoded['input_ids'])
print(iupac_merges)

print(tokenizer.unk_token_id, tokenizer.eos_token_id,
      tokenizer.unk_token, tokenizer.eos_token,
      tokenizer.vocab_size)  # Expected: 2 1 <unk> </s> 1491
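
# Round-trip check (a small sketch): decoding the full id list with special tokens stripped
# should recover the original IUPAC name, up to tokenizer-specific whitespace handling.
print(tokenizer.decode(iupac_encoded['input_ids'], skip_special_tokens=True))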
# ## Data Module
#batch = next(iter(pbar))
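# Peek at one collated mini-batch (a minimal sketch; it assumes the batch is a dict with
# tensor values for at least "input_ids" and "labels", as the disabled loop above suggests).
sample_batch = next(iter(train_dataloader))
print({key: tuple(value.shape) for key, value in sample_batch.items()
       if hasattr(value, "shape")})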


# ## GPT-2 Model
# 
# Now we load the Hugging Face
# [`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
# with a configuration composed of the previously defined model hyperparameters. The model
# processes a mini-batch of input IDs and labels, then returns the predictions and the
# cross-entropy loss between the labels and the predictions.

from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size=tokenizer.vocab_size,
                    bos_token_id=tokenizer.unk_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    n_layer=hyperparams["n_layer"],
                    n_head=hyperparams["n_head"],
                    n_embd=hyperparams["n_embd"],
                    n_positions=hyperparams["max_length"],
                    n_ctx=hyperparams["max_length"])
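
# Rough size check (a commented sketch, not part of the training run): the configuration above
# corresponds to roughly the following number of trainable parameters.
# print(sum(p.numel() for p in GPT2LMHeadModel(config).parameters()) / 1e6, "M parameters")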
#model = GPT2LMHeadModel(config)

#model= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])

#outputs = model(**batch)
#print(outputs.keys())

#['loss', 'logits', 'past_key_values']
# ## Trainer
# 
# GPT-2 is trained with the autoregressive language-modeling objective:
# $$
# P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 \mid s_1) \cdots P(s_T \mid s_1, \ldots, s_{T-1}) =
# \prod_{t=1}^{T} P(s_t \mid s_{<t}),
# $$
# where $\boldsymbol{s}$ is a tokenized (encoded) IUPAC name and $s_t$ is a token from the
# pretrained vocabulary $\mathcal{V}$.
# 
# We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
# we import the
# [`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
# wrapper (here provided by `iupac_gpt`) that implements the training phases for
# `GPT2LMHeadModel`, configures an `Adam` optimizer with a `CosineAnnealingLR` scheduler, and
# logs the average perplexity every epoch.
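#
# The objective above is exactly the shifted token-level cross-entropy that `GPT2LMHeadModel`
# computes when `labels` are supplied, and the logged perplexity is its exponential. A commented
# sketch with a placeholder random batch (`demo_ids` is not real data):
# import torch.nn.functional as F
# demo_model = GPT2LMHeadModel(config)
# demo_ids = torch.randint(0, config.vocab_size, (2, 16))
# demo_out = demo_model(input_ids=demo_ids, labels=demo_ids)
# manual_loss = F.cross_entropy(
#     demo_out.logits[:, :-1, :].reshape(-1, config.vocab_size),
#     demo_ids[:, 1:].reshape(-1))
# print(demo_out.loss, manual_loss, torch.exp(manual_loss))  # loss matches manual_loss; ppl = exp(loss)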
checkpoint = "../checkpoints/iupac"

model = GPT2LMHeadModel.from_pretrained('./pretrained', local_files_only=True)  # Load locally saved weights.


from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping



trainer = Trainer(
    gpus=gpus,
    max_epochs=hyperparams["max_epochs"],
    callbacks=[EarlyStopping(monitor="ppl", min_delta=0.1, patience=3)],
    auto_lr_find=False,  # Set to True to search for the optimal learning rate.
    auto_scale_batch_size=False,  # Set to True to scale the batch size.
    accelerator="gpu",
    strategy="ddp",
)
lit_model = gpt.GPT2LitModel(
    model,
    batch_size=hyperparams["batch_size"],
    learning_rate=hyperparams["learning_rate"],
    final_learning_rate=hyperparams["final_learning_rate"],
    weight_decay=hyperparams["weight_decay"],
    adam_eps=hyperparams["adam_eps"],
    adam_betas=hyperparams["adam_betas"],
    scheduler_T_max=hyperparams["scheduler_T_max"],
    save_model_every=1, checkpoint=checkpoint)
trainer.fit(lit_model, train_dataloader)


# Save the trained weights for later reuse (use `model.module.save_pretrained` if the model is wrapped in DataParallel).
model.save_pretrained('./pretrained')

# ## Interpretability
# 
# [BertViz](https://github.com/jessevig/bertviz) inspects the attention heads of transformers,
# which capture specific patterns in the data. Each head can be representative of syntactic
# or short-/long-range relationships between tokens.



import torch
from bertviz import head_view

input_ids_list = iupac_encoded['input_ids']
model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
attention = model(torch.LongTensor(input_ids_list))[-1]
tokens = [tokenizer.decode(i) for i in input_ids_list]
print(input_ids_list, attention, tokens)
# If the attention view is not displayed, just rerun this cell.
head_view(attention, tokens)



from bertviz import model_view

# If the attention view is not displayed, just rerun this cell.
model_view(attention, tokens)


# ## Sampling
# 
# Finally, we generate novel IUPAC names with top-$p$ (nucleus) sampling, i.e., sampling from the
# smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ of the most probable tokens
# whose cumulative probability mass exceeds $p$, $0 < p < 1$. Generation terminates upon
# encountering `"</s>"` or reaching the maximum length `hyperparams["max_length"]`. Special
# tokens are removed from the decoded output.
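
# To make the definition above concrete, here is nucleus filtering on a toy distribution,
# independent of the model (a sketch; `toy_probs` is made up):
toy_probs = torch.tensor([0.5, 0.25, 0.15, 0.07, 0.03])
sorted_probs, sorted_ids = toy_probs.sort(descending=True)
in_nucleus = sorted_probs.cumsum(0) - sorted_probs < hyperparams["top_p"]
nucleus_probs = sorted_probs[in_nucleus] / sorted_probs[in_nucleus].sum()
print(sorted_ids[in_nucleus], nucleus_probs)  # Kept token indices and renormalised probabilities.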



model.eval()  # Put the model in evaluation mode for sampling.

generated_smiles_list = []
n_generated = 50000

for _ in tqdm(range(n_generated)):
    # Start generation from "<unk>" (the start mark) so that the next token is arbitrary.
    smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])
    # Sample token IDs with nucleus (top-p) sampling.
    generated_ids = model.generate(smiles_start,
                                   max_length=hyperparams["max_length"],
                                   do_sample=True, top_p=hyperparams["top_p"],
                                   repetition_penalty=1.2,
                                   pad_token_id=tokenizer.eos_token_id)
    # Decode the IDs into a string and strip the special tokens.
    generated_smiles = tokenizer.decode(generated_ids[0],
                                        skip_special_tokens=True)
    generated_smiles_list.append(generated_smiles)

print(generated_smiles_list[:10])


import pandas as pd

df2 = pd.DataFrame(generated_smiles_list, columns=['iupac'])
df2.to_csv("iupacGPT2-gen50K.csv", index=False, mode='a')