# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string, or None if `smi` cannot be processed.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to `Chem.MolToSmiles` as `canonical`.
        isomeric: Forwarded to `Chem.MolToSmiles` as `isomericSmiles`.

    Returns:
        The string produced by `Chem.MolToSmiles`, or None when the input
        cannot be parsed or serialized.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # RDKit reports an unparsable SMILES by returning None instead of
        # raising, so handle that case explicitly rather than relying on
        # MolToSmiles(None) failing.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort helper: any ordinary error (e.g. a non-string input)
        # maps to None. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return None


# function to calculate pairwise Tanimoto similarity
def calculate_tanimoto_similarities(fps1, fps2):
    """Compute element-wise Tanimoto similarities between two fingerprint sequences.

    Args:
        fps1: Sized iterable of fingerprints.
        fps2: Sized iterable of fingerprints with the same length as `fps1`;
            the i-th item of `fps2` is compared with the i-th item of `fps1`.

    Returns:
        A list whose i-th element is `TanimotoSimilarity` of the i-th pair.
        Empty inputs yield an empty list.

    Raises:
        ValueError: If `fps1` and `fps2` have different lengths.
    """
    if len(fps1) != len(fps2):
        raise ValueError(
            "fps1 and fps2 must have the same length, got {} and {}".format(
                len(fps1), len(fps2)
            )
        )
    # zip pairs items by position, so this also works for containers whose
    # integer indexing is label-based (e.g. a filtered pandas Series).
    return [TanimotoSimilarity(fp_a, fp_b) for fp_a, fp_b in zip(fps1, fps2)]
# Load the smi-ted Light checkpoint from the local inference folder.
smi_ted_folder = '../inference/smi_ted_light'
smi_ted_checkpoint = 'smi-ted-Light_40.pt'
model_smi_ted = load_smi_ted(folder=smi_ted_folder, ckpt_filename=smi_ted_checkpoint)

# Read only the first 1000 rows of the CSV to keep the demo fast.
moses_csv_path = "./data/moses_test.csv"
df_moses = pd.read_csv(moses_csv_path, nrows=1000)
\n", " | SMILES | \n", "
---|---|
0 | \n", "CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1 | \n", "
1 | \n", "COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O | \n", "
2 | \n", "CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2 | \n", "
3 | \n", "Clc1ccccc1-c1nc(-c2ccncc2)no1 | \n", "
4 | \n", "CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1 | \n", "