{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# granite.materials.smi-TED - INFERENCE (Classification)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install extra packages for notebook\n", "%pip install seaborn xgboost" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.append('../inference')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# materials.smi-ted\n", "from smi_ted_light.load import load_smi_ted\n", "\n", "# Data\n", "import torch\n", "import pandas as pd\n", "\n", "# Chemistry\n", "from rdkit import Chem\n", "from rdkit.Chem import PandasTools\n", "from rdkit.Chem import Descriptors\n", "PandasTools.RenderImagesInAllDataFrames(True)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# function to canonicalize SMILES\n", "def normalize_smiles(smi, canonical=True, isomeric=False):\n", "    \"\"\"Return the RDKit-normalized form of a SMILES string, or None on failure.\n", "\n", "    Args:\n", "        smi: SMILES string to normalize.\n", "        canonical: forwarded to Chem.MolToSmiles as `canonical`.\n", "        isomeric: forwarded to Chem.MolToSmiles as `isomericSmiles`.\n", "\n", "    Returns:\n", "        The normalized SMILES string, or None when the input cannot be\n", "        parsed or written back out.\n", "    \"\"\"\n", "    try:\n", "        mol = Chem.MolFromSmiles(smi)\n", "        # RDKit reports an unparsable SMILES by returning None rather than raising\n", "        if mol is None:\n", "            return None\n", "        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)\n", "    except Exception:\n", "        # Best-effort: any RDKit/type error (e.g. a non-string cell) maps to None,\n", "        # while KeyboardInterrupt/SystemExit still propagate.\n", "        return None" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import smi-ted" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random Seed: 12345\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Using Rotation Embedding\n", "Vocab size: 2393\n", "[INFERENCE MODE - smi-ted-Light]\n" ] } ], "source": [ "model_smi_ted = load_smi_ted(\n", " 
folder='../inference/smi_ted_light',\n", " ckpt_filename='smi-ted-Light_40.pt'\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## BBBP Dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Experiments - Data Load" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df_train = pd.read_csv(\"../finetune/moleculenet/bbbp/train.csv\")\n", "df_test = pd.read_csv(\"../finetune/moleculenet/bbbp/test.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SMILES canonicalization" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1634, 5)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[22:56:14] Explicit valence for atom # 1 N, 4, is greater than permitted\n", "[22:56:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] Explicit valence for atom # 11 N, 4, is greater than permitted\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] Explicit valence for atom # 5 N, 4, is greater than 
permitted\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:14] WARNING: not removing hydrogen atom without neighbors\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numnamep_npsmilesnorm_smiles
01Propanolol1[Cl].CC(C)NCC(O)COc1cccc2ccccc12CC(C)NCC(O)COc1cccc2ccccc12.[Cl]
12Terbutylchlorambucil1C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCClCC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1
23407301c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
34241C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)CCC(=O)NCCCOc1cccc(CN2CCCCC2)c1
46cefoperazone1CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(...CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc...
\n", "
" ], "text/plain": [ " num name p_np \\\n", "0 1 Propanolol 1 \n", "1 2 Terbutylchlorambucil 1 \n", "2 3 40730 1 \n", "3 4 24 1 \n", "4 6 cefoperazone 1 \n", "\n", " smiles \\\n", "0 [Cl].CC(C)NCC(O)COc1cccc2ccccc12 \n", "1 C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl \n", "2 c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO... \n", "3 C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C \n", "4 CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(... \n", "\n", " norm_smiles \n", "0 CC(C)NCC(O)COc1cccc2ccccc12.[Cl] \n", "1 CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1 \n", "2 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 \n", "3 CC(=O)NCCCOc1cccc(CN2CCCCC2)c1 \n", "4 CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc... " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['norm_smiles'] = df_train['smiles'].apply(normalize_smiles)\n", "df_train_normalized = df_train.dropna()\n", "print(df_train_normalized.shape)\n", "df_train_normalized.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(192, 5)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[22:56:17] Explicit valence for atom # 12 N, 4, is greater than permitted\n", "[22:56:17] Explicit valence for atom # 5 N, 4, is greater than permitted\n", "[22:56:17] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:17] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:17] WARNING: not removing hydrogen atom without neighbors\n", "[22:56:17] WARNING: not removing hydrogen atom without neighbors\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numnamep_npsmilesnorm_smiles
013181C(Cl)ClClCCl
123SKF-936190c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)...CN(C)Cc1ccc(CSCCNc2nc(=O)c(Cc3ccc4ccccc4c3)c[n...
236etomidate1CCOC(=O)c1cncn1C(C)c2ccccc2CCOC(=O)c1cncn1C(C)c1ccccc1
33711a0CN(C)c1cc(C2=NC(N)=NN2)ccn1CN(C)c1cc(-c2nc(N)n[nH]2)ccn1
479compound 451N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3o2)c1
\n", "
" ], "text/plain": [ " num name p_np smiles \\\n", "0 13 18 1 C(Cl)Cl \n", "1 23 SKF-93619 0 c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)... \n", "2 36 etomidate 1 CCOC(=O)c1cncn1C(C)c2ccccc2 \n", "3 37 11a 0 CN(C)c1cc(C2=NC(N)=NN2)ccn1 \n", "4 79 compound 45 1 N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1 \n", "\n", " norm_smiles \n", "0 ClCCl \n", "1 CN(C)Cc1ccc(CSCCNc2nc(=O)c(Cc3ccc4ccccc4c3)c[n... \n", "2 CCOC(=O)c1cncn1C(C)c1ccccc1 \n", "3 CN(C)c1cc(-c2nc(N)n[nH]2)ccn1 \n", "4 c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3o2)c1 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test['norm_smiles'] = df_test['smiles'].apply(normalize_smiles)\n", "df_test_normalized = df_test.dropna()\n", "print(df_test_normalized.shape)\n", "df_test_normalized.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Embeddings extraction " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### smi-ted embeddings extraction" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 16/16 [00:21<00:00, 1.35s/it]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...758759760761762763764765766767
00.437218-0.5917270.0643280.3740190.530676-0.6440671.3081360.0897720.7905240.208749...-1.325162-0.0835780.1695440.359247-0.6527420.720496-0.6741840.6930000.586143-0.159641
10.344508-0.4170090.0957450.3559590.573049-0.5902791.0696990.0677240.7888150.159197...-1.312421-0.1087320.2170200.303697-0.5989660.647903-0.6659670.7918040.620691-0.107859
20.429205-0.4635420.0564410.4499250.536788-0.7499061.1938160.0825960.8602760.162548...-1.304979-0.1486200.2420450.344730-0.7046360.644773-0.7810170.7372070.585380-0.101722
30.433097-0.5230780.0897280.4101270.543400-0.6430141.2038580.0341770.7694130.202445...-1.358915-0.0774630.2287100.317884-0.6802200.531601-0.7097990.7313860.567806-0.087713
40.388423-0.5059080.0725390.3665020.533689-0.7015591.0355540.0384190.8229170.163062...-1.271012-0.1764120.1197340.294143-0.6777210.647655-0.8444190.7563210.570513-0.240003
\n", "

5 rows × 768 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 0.437218 -0.591727 0.064328 0.374019 0.530676 -0.644067 1.308136 \n", "1 0.344508 -0.417009 0.095745 0.355959 0.573049 -0.590279 1.069699 \n", "2 0.429205 -0.463542 0.056441 0.449925 0.536788 -0.749906 1.193816 \n", "3 0.433097 -0.523078 0.089728 0.410127 0.543400 -0.643014 1.203858 \n", "4 0.388423 -0.505908 0.072539 0.366502 0.533689 -0.701559 1.035554 \n", "\n", " 7 8 9 ... 758 759 760 761 \\\n", "0 0.089772 0.790524 0.208749 ... -1.325162 -0.083578 0.169544 0.359247 \n", "1 0.067724 0.788815 0.159197 ... -1.312421 -0.108732 0.217020 0.303697 \n", "2 0.082596 0.860276 0.162548 ... -1.304979 -0.148620 0.242045 0.344730 \n", "3 0.034177 0.769413 0.202445 ... -1.358915 -0.077463 0.228710 0.317884 \n", "4 0.038419 0.822917 0.163062 ... -1.271012 -0.176412 0.119734 0.294143 \n", "\n", " 762 763 764 765 766 767 \n", "0 -0.652742 0.720496 -0.674184 0.693000 0.586143 -0.159641 \n", "1 -0.598966 0.647903 -0.665967 0.791804 0.620691 -0.107859 \n", "2 -0.704636 0.644773 -0.781017 0.737207 0.585380 -0.101722 \n", "3 -0.680220 0.531601 -0.709799 0.731386 0.567806 -0.087713 \n", "4 -0.677721 0.647655 -0.844419 0.756321 0.570513 -0.240003 \n", "\n", "[5 rows x 768 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with torch.no_grad():\n", " df_embeddings_train = model_smi_ted.encode(df_train_normalized['norm_smiles'])\n", "df_embeddings_train.head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1/1 [00:04<00:00, 4.23s/it]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...758759760761762763764765766767
00.374249-0.319257-0.0070410.4447410.326734-0.7914761.121707-0.0824010.6114570.289225...-1.462539-0.3020550.295551-0.058293-0.8303190.545099-0.4602711.1211170.685016-0.452698
10.429158-0.5681040.1127390.3524290.512565-0.6041531.1818460.0679630.7869780.128077...-1.226941-0.0789270.2094680.266113-0.7622610.610685-0.7557050.7345500.592976-0.148252
20.411906-0.5104770.0730150.3468710.512772-0.6172521.1916210.0401030.7225770.188638...-1.300554-0.1507350.1482520.282791-0.6947120.556029-0.6606450.7712260.558996-0.000660
30.356793-0.5309590.0503500.4335930.592601-0.5735081.2218650.0254910.8331640.214604...-1.406141-0.1071650.2001310.289469-0.7701490.572746-0.7767390.8550640.662797-0.194417
40.422133-0.4906100.0443330.3678610.579025-0.6294091.1398240.0398230.7288250.145327...-1.312777-0.1050490.1752860.336176-0.7388130.530226-0.7633570.7649980.583681-0.109683
\n", "

5 rows × 768 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 0.374249 -0.319257 -0.007041 0.444741 0.326734 -0.791476 1.121707 \n", "1 0.429158 -0.568104 0.112739 0.352429 0.512565 -0.604153 1.181846 \n", "2 0.411906 -0.510477 0.073015 0.346871 0.512772 -0.617252 1.191621 \n", "3 0.356793 -0.530959 0.050350 0.433593 0.592601 -0.573508 1.221865 \n", "4 0.422133 -0.490610 0.044333 0.367861 0.579025 -0.629409 1.139824 \n", "\n", " 7 8 9 ... 758 759 760 761 \\\n", "0 -0.082401 0.611457 0.289225 ... -1.462539 -0.302055 0.295551 -0.058293 \n", "1 0.067963 0.786978 0.128077 ... -1.226941 -0.078927 0.209468 0.266113 \n", "2 0.040103 0.722577 0.188638 ... -1.300554 -0.150735 0.148252 0.282791 \n", "3 0.025491 0.833164 0.214604 ... -1.406141 -0.107165 0.200131 0.289469 \n", "4 0.039823 0.728825 0.145327 ... -1.312777 -0.105049 0.175286 0.336176 \n", "\n", " 762 763 764 765 766 767 \n", "0 -0.830319 0.545099 -0.460271 1.121117 0.685016 -0.452698 \n", "1 -0.762261 0.610685 -0.755705 0.734550 0.592976 -0.148252 \n", "2 -0.694712 0.556029 -0.660645 0.771226 0.558996 -0.000660 \n", "3 -0.770149 0.572746 -0.776739 0.855064 0.662797 -0.194417 \n", "4 -0.738813 0.530226 -0.763357 0.764998 0.583681 -0.109683 \n", "\n", "[5 rows x 768 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with torch.no_grad():\n", " df_embeddings_test = model_smi_ted.encode(df_test_normalized['norm_smiles'])\n", "df_embeddings_test.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Experiments - BBBP prediction using smi-ted latent spaces" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### XGBoost prediction using the whole Latent Space" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from xgboost import XGBClassifier\n", "from sklearn.metrics import roc_auc_score" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              gamma=None, grow_policy=None, importance_type=None,\n",
       "              interaction_constraints=None, learning_rate=0.04, max_bin=None,\n",
       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
       "              max_delta_step=None, max_depth=8, max_leaves=None,\n",
       "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
       "              multi_strategy=None, n_estimators=2000, n_jobs=None,\n",
       "              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.04, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=8, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=2000, n_jobs=None,\n", " num_parallel_tree=None, random_state=None, ...)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xgb_predict = XGBClassifier(n_estimators=2000, learning_rate=0.04, max_depth=8)\n", "xgb_predict.fit(df_embeddings_train, df_train_normalized['p_np'])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# get XGBoost predictions\n", "y_prob = xgb_predict.predict_proba(df_embeddings_test)[:, 1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ROC-AUC Score: 0.9194\n" ] } ], "source": [ "roc_auc = roc_auc_score(df_test_normalized[\"p_np\"], y_prob)\n", "print(f\"ROC-AUC Score: {roc_auc:.4f}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }