{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# granite.materials.smi-TED - INFERENCE (Classification)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install extra packages for notebook\n",
"%pip install seaborn xgboost"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('../inference')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# materials.smi-ted\n",
"from smi_ted_light.load import load_smi_ted\n",
"\n",
"# Data\n",
"import torch\n",
"import pandas as pd\n",
"\n",
"# Chemistry\n",
"from rdkit import Chem\n",
"from rdkit.Chem import PandasTools\n",
"from rdkit.Chem import Descriptors\n",
"PandasTools.RenderImagesInAllDataFrames(True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# function to canonicalize SMILES\n",
"def normalize_smiles(smi, canonical=True, isomeric=False):\n",
" try:\n",
" normalized = Chem.MolToSmiles(\n",
" Chem.MolFromSmiles(smi), canonical=canonical, isomericSmiles=isomeric\n",
" )\n",
" except:\n",
" normalized = None\n",
" return normalized"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import smi-ted"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Random Seed: 12345\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Using Rotation Embedding\n",
"Vocab size: 2393\n",
"[INFERENCE MODE - smi-ted-Light]\n"
]
}
],
"source": [
"model_smi_ted = load_smi_ted(\n",
" folder='../inference/smi_ted_light',\n",
" ckpt_filename='smi-ted-Light_40.pt'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## BBBP Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Experiments - Data Load"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv(\"../finetune/moleculenet/bbbp/train.csv\")\n",
"df_test = pd.read_csv(\"../finetune/moleculenet/bbbp/test.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### SMILES canonization"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1634, 5)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[22:56:14] Explicit valence for atom # 1 N, 4, is greater than permitted\n",
"[22:56:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] Explicit valence for atom # 11 N, 4, is greater than permitted\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:14] WARNING: not removing hydrogen atom without neighbors\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" num \n",
" name \n",
" p_np \n",
" smiles \n",
" norm_smiles \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" Propanolol \n",
" 1 \n",
" [Cl].CC(C)NCC(O)COc1cccc2ccccc12 \n",
" CC(C)NCC(O)COc1cccc2ccccc12.[Cl] \n",
" \n",
" \n",
" 1 \n",
" 2 \n",
" Terbutylchlorambucil \n",
" 1 \n",
" C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl \n",
" CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1 \n",
" \n",
" \n",
" 2 \n",
" 3 \n",
" 40730 \n",
" 1 \n",
" c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO... \n",
" CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 \n",
" \n",
" \n",
" 3 \n",
" 4 \n",
" 24 \n",
" 1 \n",
" C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C \n",
" CC(=O)NCCCOc1cccc(CN2CCCCC2)c1 \n",
" \n",
" \n",
" 4 \n",
" 6 \n",
" cefoperazone \n",
" 1 \n",
" CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(... \n",
" CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc... \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" num name p_np \\\n",
"0 1 Propanolol 1 \n",
"1 2 Terbutylchlorambucil 1 \n",
"2 3 40730 1 \n",
"3 4 24 1 \n",
"4 6 cefoperazone 1 \n",
"\n",
" smiles \\\n",
"0 [Cl].CC(C)NCC(O)COc1cccc2ccccc12 \n",
"1 C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl \n",
"2 c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO... \n",
"3 C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C \n",
"4 CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(... \n",
"\n",
" norm_smiles \n",
"0 CC(C)NCC(O)COc1cccc2ccccc12.[Cl] \n",
"1 CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1 \n",
"2 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 \n",
"3 CC(=O)NCCCOc1cccc(CN2CCCCC2)c1 \n",
"4 CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc... "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train['norm_smiles'] = df_train['smiles'].apply(normalize_smiles)\n",
"df_train_normalized = df_train.dropna()\n",
"print(df_train_normalized.shape)\n",
"df_train_normalized.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(192, 5)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[22:56:17] Explicit valence for atom # 12 N, 4, is greater than permitted\n",
"[22:56:17] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
"[22:56:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:17] WARNING: not removing hydrogen atom without neighbors\n",
"[22:56:17] WARNING: not removing hydrogen atom without neighbors\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" num \n",
" name \n",
" p_np \n",
" smiles \n",
" norm_smiles \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 13 \n",
" 18 \n",
" 1 \n",
" C(Cl)Cl \n",
" ClCCl \n",
" \n",
" \n",
" 1 \n",
" 23 \n",
" SKF-93619 \n",
" 0 \n",
" c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)... \n",
" CN(C)Cc1ccc(CSCCNc2nc(=O)c(Cc3ccc4ccccc4c3)c[n... \n",
" \n",
" \n",
" 2 \n",
" 36 \n",
" etomidate \n",
" 1 \n",
" CCOC(=O)c1cncn1C(C)c2ccccc2 \n",
" CCOC(=O)c1cncn1C(C)c1ccccc1 \n",
" \n",
" \n",
" 3 \n",
" 37 \n",
" 11a \n",
" 0 \n",
" CN(C)c1cc(C2=NC(N)=NN2)ccn1 \n",
" CN(C)c1cc(-c2nc(N)n[nH]2)ccn1 \n",
" \n",
" \n",
" 4 \n",
" 79 \n",
" compound 45 \n",
" 1 \n",
" N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1 \n",
" c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3o2)c1 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" num name p_np smiles \\\n",
"0 13 18 1 C(Cl)Cl \n",
"1 23 SKF-93619 0 c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)... \n",
"2 36 etomidate 1 CCOC(=O)c1cncn1C(C)c2ccccc2 \n",
"3 37 11a 0 CN(C)c1cc(C2=NC(N)=NN2)ccn1 \n",
"4 79 compound 45 1 N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1 \n",
"\n",
" norm_smiles \n",
"0 ClCCl \n",
"1 CN(C)Cc1ccc(CSCCNc2nc(=O)c(Cc3ccc4ccccc4c3)c[n... \n",
"2 CCOC(=O)c1cncn1C(C)c1ccccc1 \n",
"3 CN(C)c1cc(-c2nc(N)n[nH]2)ccn1 \n",
"4 c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3o2)c1 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test['norm_smiles'] = df_test['smiles'].apply(normalize_smiles)\n",
"df_test_normalized = df_test.dropna()\n",
"print(df_test_normalized.shape)\n",
"df_test_normalized.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Embeddings extraction "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### smi-ted embeddings extraction"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 16/16 [00:21<00:00, 1.35s/it]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 2 \n",
" 3 \n",
" 4 \n",
" 5 \n",
" 6 \n",
" 7 \n",
" 8 \n",
" 9 \n",
" ... \n",
" 758 \n",
" 759 \n",
" 760 \n",
" 761 \n",
" 762 \n",
" 763 \n",
" 764 \n",
" 765 \n",
" 766 \n",
" 767 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.437218 \n",
" -0.591727 \n",
" 0.064328 \n",
" 0.374019 \n",
" 0.530676 \n",
" -0.644067 \n",
" 1.308136 \n",
" 0.089772 \n",
" 0.790524 \n",
" 0.208749 \n",
" ... \n",
" -1.325162 \n",
" -0.083578 \n",
" 0.169544 \n",
" 0.359247 \n",
" -0.652742 \n",
" 0.720496 \n",
" -0.674184 \n",
" 0.693000 \n",
" 0.586143 \n",
" -0.159641 \n",
" \n",
" \n",
" 1 \n",
" 0.344508 \n",
" -0.417009 \n",
" 0.095745 \n",
" 0.355959 \n",
" 0.573049 \n",
" -0.590279 \n",
" 1.069699 \n",
" 0.067724 \n",
" 0.788815 \n",
" 0.159197 \n",
" ... \n",
" -1.312421 \n",
" -0.108732 \n",
" 0.217020 \n",
" 0.303697 \n",
" -0.598966 \n",
" 0.647903 \n",
" -0.665967 \n",
" 0.791804 \n",
" 0.620691 \n",
" -0.107859 \n",
" \n",
" \n",
" 2 \n",
" 0.429205 \n",
" -0.463542 \n",
" 0.056441 \n",
" 0.449925 \n",
" 0.536788 \n",
" -0.749906 \n",
" 1.193816 \n",
" 0.082596 \n",
" 0.860276 \n",
" 0.162548 \n",
" ... \n",
" -1.304979 \n",
" -0.148620 \n",
" 0.242045 \n",
" 0.344730 \n",
" -0.704636 \n",
" 0.644773 \n",
" -0.781017 \n",
" 0.737207 \n",
" 0.585380 \n",
" -0.101722 \n",
" \n",
" \n",
" 3 \n",
" 0.433097 \n",
" -0.523078 \n",
" 0.089728 \n",
" 0.410127 \n",
" 0.543400 \n",
" -0.643014 \n",
" 1.203858 \n",
" 0.034177 \n",
" 0.769413 \n",
" 0.202445 \n",
" ... \n",
" -1.358915 \n",
" -0.077463 \n",
" 0.228710 \n",
" 0.317884 \n",
" -0.680220 \n",
" 0.531601 \n",
" -0.709799 \n",
" 0.731386 \n",
" 0.567806 \n",
" -0.087713 \n",
" \n",
" \n",
" 4 \n",
" 0.388423 \n",
" -0.505908 \n",
" 0.072539 \n",
" 0.366502 \n",
" 0.533689 \n",
" -0.701559 \n",
" 1.035554 \n",
" 0.038419 \n",
" 0.822917 \n",
" 0.163062 \n",
" ... \n",
" -1.271012 \n",
" -0.176412 \n",
" 0.119734 \n",
" 0.294143 \n",
" -0.677721 \n",
" 0.647655 \n",
" -0.844419 \n",
" 0.756321 \n",
" 0.570513 \n",
" -0.240003 \n",
" \n",
" \n",
"
\n",
"
5 rows × 768 columns
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 0.437218 -0.591727 0.064328 0.374019 0.530676 -0.644067 1.308136 \n",
"1 0.344508 -0.417009 0.095745 0.355959 0.573049 -0.590279 1.069699 \n",
"2 0.429205 -0.463542 0.056441 0.449925 0.536788 -0.749906 1.193816 \n",
"3 0.433097 -0.523078 0.089728 0.410127 0.543400 -0.643014 1.203858 \n",
"4 0.388423 -0.505908 0.072539 0.366502 0.533689 -0.701559 1.035554 \n",
"\n",
" 7 8 9 ... 758 759 760 761 \\\n",
"0 0.089772 0.790524 0.208749 ... -1.325162 -0.083578 0.169544 0.359247 \n",
"1 0.067724 0.788815 0.159197 ... -1.312421 -0.108732 0.217020 0.303697 \n",
"2 0.082596 0.860276 0.162548 ... -1.304979 -0.148620 0.242045 0.344730 \n",
"3 0.034177 0.769413 0.202445 ... -1.358915 -0.077463 0.228710 0.317884 \n",
"4 0.038419 0.822917 0.163062 ... -1.271012 -0.176412 0.119734 0.294143 \n",
"\n",
" 762 763 764 765 766 767 \n",
"0 -0.652742 0.720496 -0.674184 0.693000 0.586143 -0.159641 \n",
"1 -0.598966 0.647903 -0.665967 0.791804 0.620691 -0.107859 \n",
"2 -0.704636 0.644773 -0.781017 0.737207 0.585380 -0.101722 \n",
"3 -0.680220 0.531601 -0.709799 0.731386 0.567806 -0.087713 \n",
"4 -0.677721 0.647655 -0.844419 0.756321 0.570513 -0.240003 \n",
"\n",
"[5 rows x 768 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with torch.no_grad():\n",
" df_embeddings_train = model_smi_ted.encode(df_train_normalized['norm_smiles'])\n",
"df_embeddings_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:04<00:00, 4.23s/it]\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 2 \n",
" 3 \n",
" 4 \n",
" 5 \n",
" 6 \n",
" 7 \n",
" 8 \n",
" 9 \n",
" ... \n",
" 758 \n",
" 759 \n",
" 760 \n",
" 761 \n",
" 762 \n",
" 763 \n",
" 764 \n",
" 765 \n",
" 766 \n",
" 767 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.374249 \n",
" -0.319257 \n",
" -0.007041 \n",
" 0.444741 \n",
" 0.326734 \n",
" -0.791476 \n",
" 1.121707 \n",
" -0.082401 \n",
" 0.611457 \n",
" 0.289225 \n",
" ... \n",
" -1.462539 \n",
" -0.302055 \n",
" 0.295551 \n",
" -0.058293 \n",
" -0.830319 \n",
" 0.545099 \n",
" -0.460271 \n",
" 1.121117 \n",
" 0.685016 \n",
" -0.452698 \n",
" \n",
" \n",
" 1 \n",
" 0.429158 \n",
" -0.568104 \n",
" 0.112739 \n",
" 0.352429 \n",
" 0.512565 \n",
" -0.604153 \n",
" 1.181846 \n",
" 0.067963 \n",
" 0.786978 \n",
" 0.128077 \n",
" ... \n",
" -1.226941 \n",
" -0.078927 \n",
" 0.209468 \n",
" 0.266113 \n",
" -0.762261 \n",
" 0.610685 \n",
" -0.755705 \n",
" 0.734550 \n",
" 0.592976 \n",
" -0.148252 \n",
" \n",
" \n",
" 2 \n",
" 0.411906 \n",
" -0.510477 \n",
" 0.073015 \n",
" 0.346871 \n",
" 0.512772 \n",
" -0.617252 \n",
" 1.191621 \n",
" 0.040103 \n",
" 0.722577 \n",
" 0.188638 \n",
" ... \n",
" -1.300554 \n",
" -0.150735 \n",
" 0.148252 \n",
" 0.282791 \n",
" -0.694712 \n",
" 0.556029 \n",
" -0.660645 \n",
" 0.771226 \n",
" 0.558996 \n",
" -0.000660 \n",
" \n",
" \n",
" 3 \n",
" 0.356793 \n",
" -0.530959 \n",
" 0.050350 \n",
" 0.433593 \n",
" 0.592601 \n",
" -0.573508 \n",
" 1.221865 \n",
" 0.025491 \n",
" 0.833164 \n",
" 0.214604 \n",
" ... \n",
" -1.406141 \n",
" -0.107165 \n",
" 0.200131 \n",
" 0.289469 \n",
" -0.770149 \n",
" 0.572746 \n",
" -0.776739 \n",
" 0.855064 \n",
" 0.662797 \n",
" -0.194417 \n",
" \n",
" \n",
" 4 \n",
" 0.422133 \n",
" -0.490610 \n",
" 0.044333 \n",
" 0.367861 \n",
" 0.579025 \n",
" -0.629409 \n",
" 1.139824 \n",
" 0.039823 \n",
" 0.728825 \n",
" 0.145327 \n",
" ... \n",
" -1.312777 \n",
" -0.105049 \n",
" 0.175286 \n",
" 0.336176 \n",
" -0.738813 \n",
" 0.530226 \n",
" -0.763357 \n",
" 0.764998 \n",
" 0.583681 \n",
" -0.109683 \n",
" \n",
" \n",
"
\n",
"
5 rows × 768 columns
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 0.374249 -0.319257 -0.007041 0.444741 0.326734 -0.791476 1.121707 \n",
"1 0.429158 -0.568104 0.112739 0.352429 0.512565 -0.604153 1.181846 \n",
"2 0.411906 -0.510477 0.073015 0.346871 0.512772 -0.617252 1.191621 \n",
"3 0.356793 -0.530959 0.050350 0.433593 0.592601 -0.573508 1.221865 \n",
"4 0.422133 -0.490610 0.044333 0.367861 0.579025 -0.629409 1.139824 \n",
"\n",
" 7 8 9 ... 758 759 760 761 \\\n",
"0 -0.082401 0.611457 0.289225 ... -1.462539 -0.302055 0.295551 -0.058293 \n",
"1 0.067963 0.786978 0.128077 ... -1.226941 -0.078927 0.209468 0.266113 \n",
"2 0.040103 0.722577 0.188638 ... -1.300554 -0.150735 0.148252 0.282791 \n",
"3 0.025491 0.833164 0.214604 ... -1.406141 -0.107165 0.200131 0.289469 \n",
"4 0.039823 0.728825 0.145327 ... -1.312777 -0.105049 0.175286 0.336176 \n",
"\n",
" 762 763 764 765 766 767 \n",
"0 -0.830319 0.545099 -0.460271 1.121117 0.685016 -0.452698 \n",
"1 -0.762261 0.610685 -0.755705 0.734550 0.592976 -0.148252 \n",
"2 -0.694712 0.556029 -0.660645 0.771226 0.558996 -0.000660 \n",
"3 -0.770149 0.572746 -0.776739 0.855064 0.662797 -0.194417 \n",
"4 -0.738813 0.530226 -0.763357 0.764998 0.583681 -0.109683 \n",
"\n",
"[5 rows x 768 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with torch.no_grad():\n",
" df_embeddings_test = model_smi_ted.encode(df_test_normalized['norm_smiles'])\n",
"df_embeddings_test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Experiments - BBBP prediction using smi-ted latent spaces"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### XGBoost prediction using the whole Latent Space"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from xgboost import XGBClassifier\n",
"from sklearn.metrics import roc_auc_score"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" gamma=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None, learning_rate=0.04, max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=8, max_leaves=None,\n",
" min_child_weight=None, missing=nan, monotone_constraints=None,\n",
" multi_strategy=None, n_estimators=2000, n_jobs=None,\n",
" num_parallel_tree=None, random_state=None, ...) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. XGBClassifieriFitted XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" gamma=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None, learning_rate=0.04, max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=8, max_leaves=None,\n",
" min_child_weight=None, missing=nan, monotone_constraints=None,\n",
" multi_strategy=None, n_estimators=2000, n_jobs=None,\n",
" num_parallel_tree=None, random_state=None, ...) "
],
"text/plain": [
"XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
" colsample_bylevel=None, colsample_bynode=None,\n",
" colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
" enable_categorical=False, eval_metric=None, feature_types=None,\n",
" gamma=None, grow_policy=None, importance_type=None,\n",
" interaction_constraints=None, learning_rate=0.04, max_bin=None,\n",
" max_cat_threshold=None, max_cat_to_onehot=None,\n",
" max_delta_step=None, max_depth=8, max_leaves=None,\n",
" min_child_weight=None, missing=nan, monotone_constraints=None,\n",
" multi_strategy=None, n_estimators=2000, n_jobs=None,\n",
" num_parallel_tree=None, random_state=None, ...)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"xgb_predict = XGBClassifier(n_estimators=2000, learning_rate=0.04, max_depth=8)\n",
"xgb_predict.fit(df_embeddings_train, df_train_normalized['p_np'])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# get XGBoost predictions\n",
"y_prob = xgb_predict.predict_proba(df_embeddings_test)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ROC-AUC Score: 0.9194\n"
]
}
],
"source": [
"roc_auc = roc_auc_score(df_test_normalized[\"p_np\"], y_prob)\n",
"print(f\"ROC-AUC Score: {roc_auc:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}