In [None]:
#!/usr/bin/env python
# coding: utf-8

# Generative Pre-Training from Molecules

In [1]:
import os
# To restrict the visible GPUs, assign a comma-separated *string* before CUDA is used:
# os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
from pprint import pprint
import sys
# Location of the `iupac_gpt` checkout. The IUPAC_GPT_HOME environment variable
# overrides the machine-specific default so the notebook runs on other hosts
# without editing this cell.
sys.path.append(os.environ.get("IUPAC_GPT_HOME", '/home/jmwang/drugai/iupac-gpt'))
from tqdm import tqdm
try:
    import iupac_gpt as gpt
except ImportError:
    # Fall back to the parent directory (e.g. when running from inside the repository).
    sys.path.append("..")
    import iupac_gpt as gpt
import torch

For demonstration purposes, we use only a 10K subset of the PubChem data made available by the<br>
[ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained<br>
on the first 5M compounds with the following hyperparameters:<br>
```python
hyperparams = {"batch_size": 128, "max_epochs": 2, "max_length": 512,
               "learning_rate": 5e-4, "weight_decay": 0.0,
               "adam_eps": 1e-8, "adam_betas": (0.9, 0.999),
               "scheduler_T_max": 150_000, "final_learning_rate": 5e-8,
               "vocab_size": 1_000, "min_frequency": 2, "top_p": 0.96,
               "n_layer": 4, "n_head": 8, "n_embd": 512}
```
Tokenizer, model, optimizer, scheduler, and trainer hyperparameters.

In [2]:
# Tokenizer, model, optimizer, scheduler, and trainer hyperparameters used in
# this notebook, one entry per line.
hyperparams = dict(
    # Batching, training length, and maximum sequence length.
    batch_size=128,
    max_epochs=10,
    max_length=1280,
    # Optimizer settings.
    learning_rate=5e-4,
    weight_decay=0.0,
    adam_eps=1e-8,
    adam_betas=(0.9, 0.999),
    # Learning-rate scheduler settings.
    scheduler_T_max=1_000,
    final_learning_rate=5e-8,
    # Vocabulary and sampling settings.
    vocab_size=1491,
    min_frequency=2,
    top_p=0.96,
    # GPT-2 architecture.
    n_layer=8,
    n_head=8,
    n_embd=256,
)

In [3]:
gpus = [0, 1, 2]  # Specify either a list of GPU devices or an integer (0 for no GPU).
# Number of dataloader worker processes.
# NOTE(review): this value is not passed to `gpt.get_data_loader` in this cell.
num_workers = 32
# ## Tokenization
#
# `gpt.get_data_loader` returns the training dataloader together with the IUPAC
# tokenizer used in the rest of the notebook.
#
# NOTE(review): the description below was written for the `smiles_gpt` package;
# confirm whether it also applies to the tokenizer returned here.
# `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs
# byte-pair encoding, and augments the resulting list with `"<s>"` (beginning-of-SMILES) and
# `"</s>"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible
# characters as an initial vocabulary.

# PyTorch has no "gpu" device type; CUDA devices are addressed as "cuda". Fall
# back to the CPU when CUDA is unavailable so `.to(device)` calls stay valid.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_dataloader, iupac_tokenizer = gpt.get_data_loader(
    is_train=1,
    dataset_filename='./pubchem_iupac_smile_gpt.csv',
)
# Progress-bar wrapper around the training dataloader.
pbar = tqdm(train_dataloader)  #train_dataloader.cuda()

iupac_vocab_size: 1491
training... 1491


  0%|                                                                                                                                                          | 0/144537 [00:00<?, ?it/s]


<br>
for inputs in pbar:<br>
    src_label = Variable(inputs["labels"].to(device))<br>
    inputs = prepare_input(inputs,device)<br>
    src = Variable(inputs["input_ids"].to(device))<br>
    #self.tokenizer._convert_token_to_id<br>
    print(src[:,:].shape,src_label)<br>


In [4]:
# Shorter alias for the tokenizer returned by `gpt.get_data_loader`.
tokenizer = iupac_tokenizer
# Special-token ids: start marker "<unk>" = 2, end marker "</s>" = 1, padding "<pad>" = 0.

In [5]:
# Example IUPAC name used to illustrate tokenization.
iupac_string = "2-amino-4-(2-amino-3-hydroxyphenyl)-4-oxobutanoic acid"
iupac_encoded = tokenizer(iupac_string)
# Prepend the start marker. The id is looked up from the tokenizer ("<unk>" is
# the start marker in this notebook) instead of hard-coding 2, matching how the
# model config and the sampling loop refer to it.
# NOTE(review): only `input_ids` is extended; any other field of the encoding
# keeps its original length.
iupac_encoded['input_ids'] = [tokenizer.unk_token_id] + iupac_encoded['input_ids']

In [6]:
# Decode every id on its own to see the vocabulary piece behind each position.
iupac_merges = list(map(tokenizer.decode, iupac_encoded['input_ids']))
#iupac_encoded['attention_mask']

In [7]:
# Show the token ids and, on the next line, the decoded piece for each id.
print(iupac_encoded['input_ids'], iupac_merges, sep="\n")

[2, 5, 150, 165, 150, 7, 150, 154, 5, 150, 165, 150, 6, 150, 174, 158, 153, 150, 7, 150, 166, 173, 160, 169, 198, 1]
['<unk>', '2', '-', 'amino', '-', '4', '-', '(', '2', '-', 'amino', '-', '3', '-', 'hydroxy', 'phenyl', ')', '-', '4', '-', 'oxo', 'but', 'an', 'o', 'ic acid', '</s>']


In [8]:
# Special-token ids, their string forms, and the vocabulary size
# (recorded run: 2 1 <unk> </s> 1491).
print(tokenizer.unk_token_id, tokenizer.eos_token_id,
      tokenizer.unk_token, tokenizer.eos_token, tokenizer.vocab_size)
# ## Data Module
# Take one mini-batch straight from the dataloader. Iterating over `pbar`
# instead would start the tqdm bar just to peek at a single batch and leave a
# stalled progress line in the output.
batch = next(iter(train_dataloader))

2 1 <unk> </s> 1491


  input_ids = torch.tensor(input_ids)
  batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],
  0%|                                                                                                                                                          | 0/144537 [00:12<?, ?it/s]


## GPT-2 Model<br>
<br>
Now we load HuggingFace<br>
[`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)<br>
with the configuration composed of previously<br>
defined model hyperparameters. The model processes a mini-batch of input ids and labels, then<br>
returns predictions and the cross-entropy loss between labels and predictions.

In [9]:
from transformers import GPT2Config, GPT2LMHeadModel

In [10]:
# Build the GPT-2 configuration from the tokenizer and the hyperparameters,
# then instantiate a freshly initialised language-model head on top of it.
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    # "<unk>" doubles as the beginning-of-sequence token; "</s>" ends a sequence.
    bos_token_id=tokenizer.unk_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Architecture sizes are taken verbatim from `hyperparams`.
    **{key: hyperparams[key] for key in ("n_layer", "n_head", "n_embd")},
    # Both position-related sizes follow the maximum sequence length.
    n_positions=hyperparams["max_length"],
    n_ctx=hyperparams["max_length"],
)
model = GPT2LMHeadModel(config)

odel= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])

In [11]:
# Smoke test: run the freshly initialised model on the sample mini-batch and
# list the fields of the returned output object.
outputs = model(**batch)
print(outputs.keys())

odict_keys(['loss', 'logits', 'past_key_values'])


['loss', 'logits', 'past_key_values']<br>
## Trainer<br>
<br>
GPT-2 is trained with an autoregressive language modeling objective:<br>
$$
P(\boldsymbol{s}) = P(s_1) \cdot P(s_2 | s_1) \cdots P(s_T | s_1, \ldots, s_{T-1}) =
\prod_{t=1}^{T} P(s_t | s_{j < t}),
$$
where $\boldsymbol{s}$ is a tokenized (encoded) IUPAC name and $s_t$ is a token from the pretrained<br>
vocabulary $\mathcal{V}$.<br>
<br>
We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,<br>
we import our<br>
[`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)<br>
wrapper that implements training phases for<br>
`GPT2LMHeadModel`, configures an `Adam` optimizer with `CosineAnnealingLR` scheduler, and<br>
logs average perplexity every epoch.

In[8]:

In [12]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [13]:
# Path passed to `gpt.GPT2LitModel(checkpoint=...)` for training and later given to
# `GPT2LMHeadModel.from_pretrained(checkpoint, ...)` in the interpretability section.
checkpoint = "../checkpoints/iupac"

In [14]:
# Lightning trainer for multi-GPU training from inside a notebook.
trainer = Trainer(
    # Select devices with `accelerator` + `devices`; the separate `gpus=`
    # argument is deprecated in favour of this pair.
    accelerator="gpu",
    devices=gpus,
    # "dp" is one of the strategies usable in an interactive environment; "ddp"
    # raises the MisconfigurationException recorded below this cell.
    strategy="dp",
    max_epochs=hyperparams["max_epochs"],
    # Stop once the monitored "ppl" metric has not improved by at least 0.1 for
    # 3 checks in a row (previously tried: min_delta=0.2, patience=2).
    callbacks=[EarlyStopping(monitor="ppl", min_delta=0.1, patience=3)],
    auto_lr_find=False,  # Set to True to search for optimal learning rate.
    auto_scale_batch_size=False,  # Set to True to scale batch size
)
# Lightning wrapper around the GPT-2 model; optimizer and scheduler settings
# come from `hyperparams`.
lit_model = gpt.GPT2LitModel(
    model,
    batch_size=hyperparams["batch_size"],
    learning_rate=hyperparams["learning_rate"],
    final_learning_rate=hyperparams["final_learning_rate"],
    weight_decay=hyperparams["weight_decay"],
    adam_eps=hyperparams["adam_eps"],
    adam_betas=hyperparams["adam_betas"],
    scheduler_T_max=hyperparams["scheduler_T_max"],
    save_model_every=1,
    checkpoint=checkpoint,
)
trainer.fit(lit_model, train_dataloader)

MisconfigurationException: `Trainer(strategy='ddp')` or `Trainer(accelerator='ddp')` is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible backends: dp, ddp_spawn, ddp_sharded_spawn, tpu_spawn. In case you are spawning processes yourself, make sure to include the Trainer creation inside the worker function.

odel.module.save_pretrained('./pretrained')

In [None]:
# Save the model to the ./pretrained directory.
# NOTE(review): the interpretability cells below reload from `checkpoint`
# ("../checkpoints/iupac"), not from this directory; confirm that is intended.
model.save_pretrained('./pretrained')

## Interpretability<br>
<br>
[BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers<br>
capturing specific patterns in data. Each head can be representative of some syntactic<br>
or short-/long-term relationships between tokens.

In[9]:

In [14]:
import torch
from bertviz import head_view
# Debug aid: list the variable names of `head_view`. `co_varnames` holds the
# function's parameters followed by its local variables, so not every printed
# name is an accepted argument.
print(head_view.__code__.co_varnames)

('attention', 'tokens', 'sentence_b_start', 'prettify_tokens', 'layer', 'heads', 'encoder_attention', 'decoder_attention', 'cross_attention', 'encoder_tokens', 'decoder_tokens', 'include_layers', 'html_action', 'slice_a', 'slice_b', 'vis_id', 'options', 'select_html', 'vis_html', 'd', 'attn_seq_len_left', 'attn_seq_len_right', 'params', '__location__', 'vis_js', 'html1', 'html2', 'html3', 'script', 'head_html')


In [22]:
input_ids_list = iupac_encoded['input_ids']
# Reload the trained model with attention outputs enabled.
# NOTE: this rebinds `model`; from here on it refers to the reloaded model.
model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)
# Readable token for every id (kept in full; sliced where needed).
tokens = [tokenizer.decode(i) for i in input_ids_list]
# Drop the first and last ids (the manually prepended start id and the trailing
# "</s>") so that only the name itself is visualised.
inner_ids = input_ids_list[1:-1]
# Visualisation only: no gradients are needed. The ids are wrapped in an
# explicit batch of one, and the attentions are read by name rather than by
# position in the output tuple.
with torch.no_grad():
    attention = model(torch.LongTensor([inner_ids])).attentions
#print(input_ids_list,attention,tokens)
# Don't worry if a snippet is not displayed---just rerun this cell.

head_view_html = head_view(attention=attention, tokens=tokens[1:-1], html_action='return')

# Write with an explicit encoding so the result does not depend on the
# platform's default text encoding.
with open("iupac_head_view.html", 'w', encoding='utf-8') as html_file:
    html_file.write(head_view_html.data)
head_view_html

In [18]:
from bertviz import model_view
# Debug aid: list the variable names of `model_view`. `co_varnames` holds the
# function's parameters followed by its local variables, so not every printed
# name is an accepted argument.
print(model_view.__code__.co_varnames)

('attention', 'tokens', 'sentence_b_start', 'prettify_tokens', 'display_mode', 'encoder_attention', 'decoder_attention', 'cross_attention', 'encoder_tokens', 'decoder_tokens', 'include_layers', 'include_heads', 'html_action', 'n_heads', 'slice_a', 'slice_b', 'vis_id', 'options', 'select_html', 'vis_html', 'd', 'attn_seq_len_left', 'attn_seq_len_right', 'params', '__location__', 'vis_js', 'html1', 'html2', 'html3', 'script', 'head_html')


Don't worry if a snippet is not displayed---just rerun this cell.

In [23]:
# Render all layers and heads at once for the same attention tensors and tokens
# used by the head view, again without the first and last token.
model_view_html = model_view(attention, tokens[1:-1], html_action='return')

# Write with an explicit encoding so the result does not depend on the
# platform's default text encoding.
with open("iupac_model_view.html", 'w', encoding='utf-8') as html_file:
    html_file.write(model_view_html.data)
model_view_html

## Sampling<br>
<br>
Finally, we generate novel IUPAC names with top-$p$ sampling, i.e., sampling from the<br>
smallest vocabulary subset $\mathcal{V}^{(p)} \subset \mathcal{V}$ that contains the most<br>
probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. The model<br>
terminates the procedure upon encountering `"</s>"` or reaching the maximum length<br>
`hyperparams["max_length"]`. Special tokens are removed afterwards.

In [None]:
# NOTE(review): this rebinds the name `tqdm` from the function imported in the
# first cell (`from tqdm import tqdm`) to the module. The sampling loop below
# relies on that (`tqdm.tqdm(...)`), but re-running the earlier
# `tqdm(train_dataloader)` cell after this one would fail.
import tqdm

In [None]:
# `eval()` returns the module itself, so as the last expression of the cell it
# also displays the model's repr.
model.eval()  # Set the base model to evaluation mode.

In [None]:
# Number of sequences to sample, and the list the sampling loop appends to.
n_generated = 30_000
generated_smiles_list = list()

In [None]:
# Every sample starts from the same single-token prompt, so build it once
# instead of on each of the `n_generated` iterations.
# Generate from "<unk>" so that the next token is arbitrary.
smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])

# `tqdm` is the module here (see `import tqdm` above), hence `tqdm.tqdm`.
for _ in tqdm.tqdm(range(n_generated)):
    # Get generated token IDs (top-p sampling, capped at the maximum length).
    generated_ids = model.generate(smiles_start,
                                   max_length=hyperparams["max_length"],
                                   do_sample=True, top_p=hyperparams["top_p"],
                                   repetition_penalty=1.2,
                                   pad_token_id=tokenizer.eos_token_id)
    # Decode the IDs into text, dropping special tokens such as "<unk>" and "</s>".
    generated_smiles = tokenizer.decode(generated_ids[0],
                                        skip_special_tokens=True)
    generated_smiles_list.append(generated_smiles)

In [None]:
# Peek at the first ten generated strings.
print(generated_smiles_list[:10])

In [None]:
import numpy as np
import pandas as pd

In [None]:
# One row per generated string, in a single column named "iupac".
df2 = pd.DataFrame(data=generated_smiles_list, columns=['iupac'])

In [None]:
# Append the generated names to the output CSV. The header row is written only
# when the file is new or empty; otherwise every re-run would insert another
# "iupac" header line in the middle of the data.
output_csv = "iupacGPT2-gen30K.csv"
write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
df2.to_csv(output_csv, index=False, mode='a', header=write_header)