diff --git "a/iupac-gpt/notebooks/iupac_language-modeling_train.ipynb" "b/iupac-gpt/notebooks/iupac_language-modeling_train.ipynb"
new file mode 100644
--- /dev/null
+++ "b/iupac-gpt/notebooks/iupac_language-modeling_train.ipynb"
@@ -0,0 +1,1446 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!/usr/bin/env python\n",
+ "# coding: utf-8"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Generative Pre-Training from Molecules"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"] = ['1',\"2\"]\n",
+ "from pprint import pprint\n",
+ "import sys\n",
+ "sys.path.append('/home/jmwang/drugai/iupac-gpt')\n",
+ "from tqdm import tqdm\n",
+ "try:\n",
+ " import iupac_gpt as gpt\n",
+ "except ImportError:\n",
+ " import sys\n",
+ " sys.path.extend([\"..\"]) # Parent directory stores `smiles_gpt` package.\n",
+ " import iupac_gpt as gpt\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For demonstration purposes, we use only 10K subset of PubChem data made available by
\n",
+ "[ChemBERTa](https://arxiv.org/abs/2010.09885) developers. The original model was pretrained
\n",
+ "on the first 5M compounds with the following hyperparameters:
\n",
+ "```python
\n",
+ "hyperparams = {\"batch_size\": 128, \"max_epochs\": 2, \"max_length\": 512,
\n",
+ " \"learning_rate\": 5e-4, \"weight_decay\": 0.0,
\n",
+ " \"adam_eps\": 1e-8, \"adam_betas\": (0.9, 0.999),
\n",
+ " \"scheduler_T_max\": 150_000, \"final_learning_rate\": 5e-8,
\n",
+ " \"vocab_size\": 1_000, \"min_frequency\": 2, \"top_p\": 0.96,
\n",
+ " \"n_layer\": 4, \"n_head\": 8, \"n_embd\": 512}
\n",
+ "```
\n",
+ "Tokenizer, model, optimizer, scheduler, and trainer hyperparameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "hyperparams = {\"batch_size\": 128, \"max_epochs\": 10, \"max_length\": 1280,\n",
+ " \"learning_rate\": 5e-4, \"weight_decay\": 0.0,\n",
+ " \"adam_eps\": 1e-8, \"adam_betas\": (0.9, 0.999),\n",
+ " \"scheduler_T_max\": 1_000, \"final_learning_rate\": 5e-8,\n",
+ " \"vocab_size\": 1491, \"min_frequency\": 2, \"top_p\": 0.96,\n",
+ " \"n_layer\": 8, \"n_head\": 8, \"n_embd\": 256}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "iupac_vocab_size: 1491\n",
+ "training... 1491\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 0%| | 0/144537 [00:00, ?it/s]"
+ ]
+ }
+ ],
+ "source": [
+ "gpus = [0,1,2] # Specify either a list of GPU devices or an integer (0 for no GPU).\n",
+ "num_workers = 32 # Number of dataloader worker processes.\n",
+ "# ## Tokenization\n",
+ "# \n",
+ "# `smiles_gpt.SMILESBPETokenizer` first splits SMILES strings into characters, runs\n",
+ "# byte-pair encoding, and augments the resulting list with `\"\"` (beginning-of-SMILES) and\n",
+ "# `\"\"` (end-of-SMILES) special tokens. `smiles_gpt.SMILESAlphabet` stores 72 possible\n",
+ "# characters as an initial vocabulary.\n",
+ "device = 'gpu'\n",
+ "train_dataloader,iupac_tokenizer = gpt.get_data_loader(is_train=1,dataset_filename = './pubchem_iupac_smile_gpt.csv')\n",
+ "pbar = tqdm(train_dataloader) #train_dataloader.cuda()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ "for inputs in pbar:
\n",
+ " src_label = Variable(inputs[\"labels\"].to(device))
\n",
+ " inputs = prepare_input(inputs,device)
\n",
+ " src = Variable(inputs[\"input_ids\"].to(device))
\n",
+ " #self.tokenizer._convert_token_to_id
\n",
+ " print(src[:,:].shape,src_label)
\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = iupac_tokenizer\n",
+ "#start mark 2, end mark 1, pad 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "iupac_string = \"2-amino-4-(2-amino-3-hydroxyphenyl)-4-oxobutanoic acid\"\n",
+ "iupac_encoded = tokenizer(iupac_string)\n",
+ "iupac_encoded['input_ids'] = [2]+iupac_encoded['input_ids']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "iupac_merges = [tokenizer.decode(i) for i in iupac_encoded['input_ids']]\n",
+ "#iupac_encoded['attention_mask']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[2, 5, 150, 165, 150, 7, 150, 154, 5, 150, 165, 150, 6, 150, 174, 158, 153, 150, 7, 150, 166, 173, 160, 169, 198, 1]\n",
+ "['', '2', '-', 'amino', '-', '4', '-', '(', '2', '-', 'amino', '-', '3', '-', 'hydroxy', 'phenyl', ')', '-', '4', '-', 'oxo', 'but', 'an', 'o', 'ic acid', '']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(iupac_encoded['input_ids'])\n",
+ "print(iupac_merges)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2 1 1491\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/jmwang/drugai/iupac-gpt/iupac_gpt/iupac_dataset.py:103: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
+ " input_ids = torch.tensor(input_ids)\n",
+ "/home/jmwang/drugai/iupac-gpt/iupac_gpt/iupac_tokenization_iupac.py:44: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
+ " batch[k] = pad_sequence([torch.tensor(r[k]) for r in records],\n",
+ " 0%| | 0/144537 [00:12, ?it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(tokenizer.unk_token_id,tokenizer.eos_token_id,tokenizer.unk_token,tokenizer.eos_token,tokenizer.vocab_size) #2 1 1491\n",
+ "# ## Data Module\n",
+ "batch = next(iter(pbar))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## GPT-2 Model
\n",
+ "
\n",
+ "Now we load HuggingFace
\n",
+ "[`GPT2LMHeadModel`](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel)
\n",
+ "with the configuration composed of previously
\n",
+ "defined model hyperparameters. The model processes mini-batch of input ids and labels, then
\n",
+ "returns predictions and cross-entropy loss between labels and predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import GPT2Config, GPT2LMHeadModel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "config = GPT2Config(vocab_size=tokenizer.vocab_size,\n",
+ " bos_token_id=tokenizer.unk_token_id,\n",
+ " eos_token_id=tokenizer.eos_token_id,\n",
+ " n_layer=hyperparams[\"n_layer\"],\n",
+ " n_head=hyperparams[\"n_head\"],\n",
+ " n_embd=hyperparams[\"n_embd\"],\n",
+ " n_positions=hyperparams[\"max_length\"],\n",
+ " n_ctx=hyperparams[\"max_length\"])\n",
+ "model = GPT2LMHeadModel(config)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "odel= torch.nn.DataParallel(model.cuda(),device_ids=gpus,output_device=gpus[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "odict_keys(['loss', 'logits', 'past_key_values'])\n"
+ ]
+ }
+ ],
+ "source": [
+ "outputs = model(**batch)\n",
+ "print(outputs.keys())"
+ ]
+ },
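+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `loss` entry above is the shifted cross-entropy between predictions and labels. As a minimal\n",
+ "sketch (not part of the original notebook), it can be reproduced from `logits`; the toy id\n",
+ "sequence below is purely illustrative, and `GPT2LMHeadModel` handles the one-position shift\n",
+ "internally when `labels` are passed:\n",
+ "```python\n",
+ "import torch\n",
+ "import torch.nn.functional as F\n",
+ "\n",
+ "input_ids = torch.LongTensor([[2, 5, 150, 165, 150, 1]])  # toy encoded IUPAC name\n",
+ "out = model(input_ids=input_ids, labels=input_ids)\n",
+ "\n",
+ "# Reproduce out.loss: predict token t+1 from tokens up to t.\n",
+ "shift_logits = out.logits[:, :-1, :]\n",
+ "shift_labels = input_ids[:, 1:]\n",
+ "manual_loss = F.cross_entropy(shift_logits.reshape(-1, shift_logits.size(-1)),\n",
+ "                              shift_labels.reshape(-1))\n",
+ "print(out.loss.item(), manual_loss.item())  # the two values should match\n",
+ "```"
+ ]
+ },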
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "'loss', 'logits', 'past_key_values']
\n",
+ "## Trainer
\n",
+ "
\n",
+ "GPT-2 is trained with autoregressive language modeling objective:
\n",
+ "$$
\n",
+ "P(\\boldsymbol{s}) = P(s_1) \\cdot P(s_2 | s_1) \\cdots P(s_T | s_1, \\ldots, s_{T-1}) =
\n",
+ "\\prod_{t=1}^{T} P(s_t | s_{j < t}),
\n",
+ "$$
\n",
+ "where $\\boldsymbol{s}$ is a tokenized (encoded) SMILES string, $s_t$ is a token from pretrained
\n",
+ "vocabulary $\\mathcal{V}$.
\n",
+ "
\n",
+ "We use `pytorch_lightning.Trainer` to train GPT-2. Since `Trainer` requires lightning modules,
\n",
+ "we import our
\n",
+ "[`smiles_gpt.GPT2LitModel`](https://github.com/sanjaradylov/smiles-gpt/blob/master/smiles_gpt/language_modeling.py#L10)
\n",
+ "wrapper that implements training phases for
\n",
+ "`GPT2LMHeadModel`, configures an `Adam` optimizer with `CosineAnnealingLR` scheduler, and
\n",
+ "logs average perplexity every epoch."
+ ]
+ },
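+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The actual implementation lives in the repository linked above; the stripped-down sketch below\n",
+ "only illustrates what such a wrapper roughly does (the class name and argument handling here are\n",
+ "illustrative, not the package's exact API):\n",
+ "```python\n",
+ "import pytorch_lightning as pl\n",
+ "import torch\n",
+ "\n",
+ "class LitCausalLM(pl.LightningModule):  # hypothetical name, mirrors gpt.GPT2LitModel\n",
+ "    def __init__(self, model, learning_rate, final_learning_rate,\n",
+ "                 weight_decay, adam_eps, adam_betas, scheduler_T_max):\n",
+ "        super().__init__()\n",
+ "        self.model = model\n",
+ "        self.save_hyperparameters(ignore=[\"model\"])\n",
+ "\n",
+ "    def training_step(self, batch, batch_idx):\n",
+ "        loss = self.model(**batch).loss  # autoregressive cross-entropy\n",
+ "        self.log(\"ppl\", torch.exp(loss), on_epoch=True, prog_bar=True)\n",
+ "        return loss\n",
+ "\n",
+ "    def configure_optimizers(self):\n",
+ "        optimizer = torch.optim.Adam(self.parameters(),\n",
+ "                                     lr=self.hparams.learning_rate,\n",
+ "                                     weight_decay=self.hparams.weight_decay,\n",
+ "                                     eps=self.hparams.adam_eps,\n",
+ "                                     betas=self.hparams.adam_betas)\n",
+ "        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(\n",
+ "            optimizer, T_max=self.hparams.scheduler_T_max,\n",
+ "            eta_min=self.hparams.final_learning_rate)\n",
+ "        return [optimizer], [scheduler]\n",
+ "```"
+ ]
+ },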
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pytorch_lightning import Trainer\n",
+ "from pytorch_lightning.callbacks.early_stopping import EarlyStopping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "checkpoint = \"../checkpoints/iupac\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "MisconfigurationException",
+ "evalue": "`Trainer(strategy='ddp')` or `Trainer(accelerator='ddp')` is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible backends: dp, ddp_spawn, ddp_sharded_spawn, tpu_spawn. In case you are spawning processes yourself, make sure to include the Trainer creation inside the worker function.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mMisconfigurationException\u001b[0m Traceback (most recent call last)",
+ "Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m trainer \u001b[38;5;241m=\u001b[39m \u001b[43mTrainer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mgpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhyperparams\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmax_epochs\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43mEarlyStopping\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mppl\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0.1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m#[EarlyStopping(\"ppl\", 0.2, 2)]\u001b[39;49;00m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43mauto_lr_find\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Set to True to search for optimal learning rate.\u001b[39;49;00m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mauto_scale_batch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Set to True to scale batch size\u001b[39;49;00m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# accelerator=\"dp\" # Uncomment for GPU training.\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43maccelerator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgpu\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m#devices=4,\u001b[39;49;00m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mstrategy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mddp\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 10\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m lit_model \u001b[38;5;241m=\u001b[39m gpt\u001b[38;5;241m.\u001b[39mGPT2LitModel(\n\u001b[1;32m 12\u001b[0m model,\n\u001b[1;32m 13\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mhyperparams[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbatch_size\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 19\u001b[0m scheduler_T_max\u001b[38;5;241m=\u001b[39mhyperparams[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscheduler_T_max\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 20\u001b[0m save_model_every\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, checkpoint\u001b[38;5;241m=\u001b[39mcheckpoint)\n\u001b[1;32m 21\u001b[0m trainer\u001b[38;5;241m.\u001b[39mfit(lit_model, train_dataloader)\n",
+ "File \u001b[0;32m~/anaconda3/envs/smiles-gpt/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/env_vars_connector.py:38\u001b[0m, in \u001b[0;36m_defaults_from_env_vars..insert_env_defaults\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 35\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mdict\u001b[39m(\u001b[38;5;28mlist\u001b[39m(env_variables\u001b[38;5;241m.\u001b[39mitems()) \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mlist\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mitems()))\n\u001b[1;32m 37\u001b[0m \u001b[38;5;66;03m# all args were already moved to kwargs\u001b[39;00m\n\u001b[0;32m---> 38\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/anaconda3/envs/smiles-gpt/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py:431\u001b[0m, in \u001b[0;36mTrainer.__init__\u001b[0;34m(self, logger, checkpoint_callback, enable_checkpointing, callbacks, default_root_dir, gradient_clip_val, gradient_clip_algorithm, process_position, num_nodes, num_processes, devices, gpus, auto_select_gpus, tpu_cores, ipus, log_gpu_memory, progress_bar_refresh_rate, enable_progress_bar, overfit_batches, track_grad_norm, check_val_every_n_epoch, fast_dev_run, accumulate_grad_batches, max_epochs, min_epochs, max_steps, min_steps, max_time, limit_train_batches, limit_val_batches, limit_test_batches, limit_predict_batches, val_check_interval, flush_logs_every_n_steps, log_every_n_steps, accelerator, strategy, sync_batchnorm, precision, enable_model_summary, weights_summary, weights_save_path, num_sanity_val_steps, resume_from_checkpoint, profiler, benchmark, deterministic, reload_dataloaders_every_n_epochs, reload_dataloaders_every_epoch, auto_lr_find, replace_sampler_ddp, detect_anomaly, auto_scale_batch_size, prepare_data_per_node, plugins, amp_backend, amp_level, move_metrics_to_cpu, multiple_trainloader_mode, stochastic_weight_avg, terminate_on_nan)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;66;03m# init connectors\u001b[39;00m\n\u001b[1;32m 429\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data_connector \u001b[38;5;241m=\u001b[39m DataConnector(\u001b[38;5;28mself\u001b[39m, multiple_trainloader_mode)\n\u001b[0;32m--> 431\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accelerator_connector \u001b[38;5;241m=\u001b[39m \u001b[43mAcceleratorConnector\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_processes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevices\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mtpu_cores\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43mipus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 436\u001b[0m \u001b[43m \u001b[49m\u001b[43maccelerator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 437\u001b[0m \u001b[43m \u001b[49m\u001b[43mstrategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 438\u001b[0m \u001b[43m \u001b[49m\u001b[43mgpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 439\u001b[0m \u001b[43m \u001b[49m\u001b[43mgpu_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 440\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_nodes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 441\u001b[0m \u001b[43m \u001b[49m\u001b[43msync_batchnorm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 442\u001b[0m \u001b[43m \u001b[49m\u001b[43mbenchmark\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 443\u001b[0m \u001b[43m \u001b[49m\u001b[43mreplace_sampler_ddp\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 444\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeterministic\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 445\u001b[0m \u001b[43m \u001b[49m\u001b[43mprecision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 446\u001b[0m \u001b[43m \u001b[49m\u001b[43mamp_backend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 447\u001b[0m \u001b[43m \u001b[49m\u001b[43mamp_level\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 448\u001b[0m \u001b[43m \u001b[49m\u001b[43mplugins\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 449\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 450\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogger_connector \u001b[38;5;241m=\u001b[39m LoggerConnector(\u001b[38;5;28mself\u001b[39m, log_gpu_memory)\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_callback_connector \u001b[38;5;241m=\u001b[39m CallbackConnector(\u001b[38;5;28mself\u001b[39m)\n",
+ "File \u001b[0;32m~/anaconda3/envs/smiles-gpt/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:164\u001b[0m, in \u001b[0;36mAcceleratorConnector.__init__\u001b[0;34m(self, num_processes, devices, tpu_cores, ipus, accelerator, strategy, gpus, gpu_ids, num_nodes, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic, precision, amp_type, amp_level, plugins)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mselect_accelerator_type()\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_set_training_type_plugin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mset_distributed_mode()\n",
+ "File \u001b[0;32m~/anaconda3/envs/smiles-gpt/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:311\u001b[0m, in \u001b[0;36mAcceleratorConnector._set_training_type_plugin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_training_type_plugin \u001b[38;5;241m=\u001b[39m TrainingTypePluginsRegistry\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy)\n\u001b[1;32m 310\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m--> 311\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mset_distributed_mode\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstrategy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy, TrainingTypePlugin):\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_training_type_plugin \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstrategy\n",
+ "File \u001b[0;32m~/anaconda3/envs/smiles-gpt/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:902\u001b[0m, in \u001b[0;36mAcceleratorConnector.set_distributed_mode\u001b[0;34m(self, strategy)\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_distrib_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 901\u001b[0m \u001b[38;5;66;03m# finished configuring self._distrib_type, check ipython environment\u001b[39;00m\n\u001b[0;32m--> 902\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_interactive_compatibility\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 904\u001b[0m \u001b[38;5;66;03m# for DDP overwrite nb processes by requested GPUs\u001b[39;00m\n\u001b[1;32m 905\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_device_type \u001b[38;5;241m==\u001b[39m DeviceType\u001b[38;5;241m.\u001b[39mGPU \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_distrib_type \u001b[38;5;129;01min\u001b[39;00m (\n\u001b[1;32m 906\u001b[0m DistributedType\u001b[38;5;241m.\u001b[39mDDP,\n\u001b[1;32m 907\u001b[0m DistributedType\u001b[38;5;241m.\u001b[39mDDP_SPAWN,\n\u001b[1;32m 908\u001b[0m ):\n",
+ "File \u001b[0;32m~/anaconda3/envs/smiles-gpt/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:943\u001b[0m, in \u001b[0;36mAcceleratorConnector.check_interactive_compatibility\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 940\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpytorch_lightning\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutilities\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _IS_INTERACTIVE\n\u001b[1;32m 942\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _IS_INTERACTIVE \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_distrib_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_distrib_type\u001b[38;5;241m.\u001b[39mis_interactive_compatible():\n\u001b[0;32m--> 943\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m MisconfigurationException(\n\u001b[1;32m 944\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m`Trainer(strategy=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_distrib_type\u001b[38;5;241m.\u001b[39mvalue\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m)` or\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 945\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m `Trainer(accelerator=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_distrib_type\u001b[38;5;241m.\u001b[39mvalue\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[38;5;124m)` is not compatible with an interactive\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 946\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m environment. Run your code as a script, or choose one of the compatible backends:\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 947\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(DistributedType\u001b[38;5;241m.\u001b[39minteractive_compatible_types())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 948\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m In case you are spawning processes yourself, make sure to include the Trainer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 949\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m creation inside the worker function.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 950\u001b[0m )\n",
+ "\u001b[0;31mMisconfigurationException\u001b[0m: `Trainer(strategy='ddp')` or `Trainer(accelerator='ddp')` is not compatible with an interactive environment. Run your code as a script, or choose one of the compatible backends: dp, ddp_spawn, ddp_sharded_spawn, tpu_spawn. In case you are spawning processes yourself, make sure to include the Trainer creation inside the worker function."
+ ]
+ }
+ ],
+ "source": [
+ "trainer = Trainer(\n",
+ " gpus=gpus,\n",
+ " max_epochs=hyperparams[\"max_epochs\"],\n",
+ " callbacks=[EarlyStopping(\"ppl\", 0.1, 3)], #[EarlyStopping(\"ppl\", 0.2, 2)]\n",
+ " auto_lr_find=False, # Set to True to search for optimal learning rate.\n",
+ " auto_scale_batch_size=False, # Set to True to scale batch size\n",
+ " # accelerator=\"dp\" # Uncomment for GPU training.\n",
+ " accelerator=\"gpu\", #devices=4,\n",
+ " strategy=\"dp\"\n",
+ ")\n",
+ "lit_model = gpt.GPT2LitModel(\n",
+ " model,\n",
+ " batch_size=hyperparams[\"batch_size\"],\n",
+ " learning_rate=hyperparams[\"learning_rate\"],\n",
+ " final_learning_rate=hyperparams[\"final_learning_rate\"],\n",
+ " weight_decay=hyperparams[\"weight_decay\"],\n",
+ " adam_eps=hyperparams[\"adam_eps\"],\n",
+ " adam_betas=hyperparams[\"adam_betas\"],\n",
+ " scheduler_T_max=hyperparams[\"scheduler_T_max\"],\n",
+ " save_model_every=1, checkpoint=checkpoint)\n",
+ "trainer.fit(lit_model, train_dataloader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "odel.module.save_pretrained('./pretrained')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save_pretrained('./pretrained')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Interpretability \n",
+ " \n",
+ "[BertViz](https://github.com/jessevig/bertviz) inspects attention heads of transformers \n",
+ "capturing specific patterns in data. Each head can be representative of some syntactic \n",
+ "or short-/long-term relationships between tokens."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "('attention', 'tokens', 'sentence_b_start', 'prettify_tokens', 'layer', 'heads', 'encoder_attention', 'decoder_attention', 'cross_attention', 'encoder_tokens', 'decoder_tokens', 'include_layers', 'html_action', 'slice_a', 'slice_b', 'vis_id', 'options', 'select_html', 'vis_html', 'd', 'attn_seq_len_left', 'attn_seq_len_right', 'params', '__location__', 'vis_js', 'html1', 'html2', 'html3', 'script', 'head_html')\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "from bertviz import head_view\n",
+ "print(head_view.__code__.co_varnames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " \n",
+ " \n",
+ " \n",
+ " Layer: \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "input_ids_list = iupac_encoded['input_ids']\n",
+ "model = GPT2LMHeadModel.from_pretrained(checkpoint, output_attentions=True)\n",
+ "attention = model(torch.LongTensor(input_ids_list[1:-1]))[-1]\n",
+ "tokens = [tokenizer.decode(i) for i in input_ids_list]\n",
+ "#print(input_ids_list,attention,tokens)\n",
+ "# Don't worry if a snippet is not displayed---just rerun this cell.\n",
+ "\n",
+ "a=head_view(attention = attention, tokens=tokens[1:-1],html_action='return')\n",
+ "\n",
+ "with open(\"iupac_head_view.html\", 'w') as file:\n",
+ " file.write(a.data)\n",
+ "a"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "('attention', 'tokens', 'sentence_b_start', 'prettify_tokens', 'display_mode', 'encoder_attention', 'decoder_attention', 'cross_attention', 'encoder_tokens', 'decoder_tokens', 'include_layers', 'include_heads', 'html_action', 'n_heads', 'slice_a', 'slice_b', 'vis_id', 'options', 'select_html', 'vis_html', 'd', 'attn_seq_len_left', 'attn_seq_len_right', 'params', '__location__', 'vis_js', 'html1', 'html2', 'html3', 'script', 'head_html')\n"
+ ]
+ }
+ ],
+ "source": [
+ "from bertviz import model_view\n",
+ "print(model_view.__code__.co_varnames)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Don't worry if a snippet is not displayed---just rerun this cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "a=model_view(attention, tokens[1:-1],html_action='return')\n",
+ "\n",
+ "with open(\"iupac_model_view.html\", 'w') as file:\n",
+ " file.write(a.data)\n",
+ "a"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Sampling \n",
+ " \n",
+ "Finally, we generate novel SMILES strings with top-$p$ sampling$-$i.e., sampling from the \n",
+ "smallest vocabulary subset $\\mathcal{V}^{(p)} \\subset \\mathcal{V}$ s.t. it takes up the most \n",
+ "probable tokens whose cumulative probability mass exceeds $p$, $0 < p < 1$. Model \n",
+ "terminates the procedure upon encountering `\"\"` or reaching maximum number \n",
+ "`hyperparams[\"max_length\"]`. Special tokens are eventually removed."
+ ]
+ },
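+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "HuggingFace's `generate` applies the nucleus filter internally when `top_p` is set; the sketch\n",
+ "below (not used by the notebook itself) only illustrates the filtering step for a single\n",
+ "next-token distribution:\n",
+ "```python\n",
+ "import torch\n",
+ "\n",
+ "def top_p_filter(logits, p=0.96):\n",
+ "    # Keep the smallest set of tokens whose cumulative probability exceeds p.\n",
+ "    sorted_logits, sorted_idx = torch.sort(logits, descending=True)\n",
+ "    cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)\n",
+ "    remove = cum_probs > p\n",
+ "    remove[..., 1:] = remove[..., :-1].clone()  # shift right: always keep the top token\n",
+ "    remove[..., 0] = False\n",
+ "    filtered = logits.clone()\n",
+ "    filtered[sorted_idx[remove]] = float(\"-inf\")\n",
+ "    return filtered\n",
+ "\n",
+ "next_token_logits = torch.randn(tokenizer.vocab_size)\n",
+ "probs = torch.softmax(top_p_filter(next_token_logits, hyperparams[\"top_p\"]), dim=-1)\n",
+ "next_token = torch.multinomial(probs, num_samples=1)\n",
+ "```"
+ ]
+ },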
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.eval() # Set the base model to evaluation mode."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generated_smiles_list = []\n",
+ "n_generated = 30000"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for _ in tqdm.tqdm(range(n_generated)):\n",
+ " # Generate from \"\" so that the next token is arbitrary.\n",
+ " smiles_start = torch.LongTensor([[tokenizer.unk_token_id]])\n",
+ " # Get generated token IDs.\n",
+ " generated_ids = model.generate(smiles_start,\n",
+ " max_length=hyperparams[\"max_length\"],\n",
+ " do_sample=True,top_p=hyperparams[\"top_p\"],\n",
+ " repetition_penalty=1.2,\n",
+ " pad_token_id=tokenizer.eos_token_id)\n",
+ " # Decode the IDs into tokens and remove \"\" and \"\".\n",
+ " generated_smiles = tokenizer.decode(generated_ids[0],\n",
+ " skip_special_tokens=True)\n",
+ " generated_smiles_list.append(generated_smiles)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(generated_smiles_list[:10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = pd.DataFrame(generated_smiles_list, columns=['iupac']) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.to_csv(\"iupacGPT2-gen30K.csv\",index=None,mode='a')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "smiles-gpt",
+ "language": "python",
+ "name": "smiles-gpt"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}