# Standard library
import os

# Data
import numpy as np
import pandas as pd

# Deep learning
import torch
from torch.utils.data import Dataset

# Metrics
from sklearn.metrics import confusion_matrix

# Chemistry
from rdkit import Chem
from rdkit.Chem import Descriptors, PandasTools

PandasTools.RenderImagesInAllDataFrames(True)

def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string, or None if the input cannot be parsed."""
    try:
        normalized = Chem.MolToSmiles(
            Chem.MolFromSmiles(smi), canonical=canonical, isomericSmiles=isomeric
        )
    except Exception:
        # MolFromSmiles returns None for invalid SMILES, which makes MolToSmiles raise
        normalized = None
    return normalized
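
# Usage sketch (illustrative values): canonicalization maps equivalent SMILES to
# one string, and invalid input comes back as None.
#   normalize_smiles('C1=CC=CC=C1')   # -> 'c1ccccc1' (benzene)
#   normalize_smiles('not-a-smiles')  # -> None
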
class RMSELoss:
    """Root-mean-square error loss for torch tensors."""

    def __call__(self, yhat, y):
        return torch.sqrt(torch.mean((yhat - y) ** 2))

def RMSE(predictions, targets):
    """Root-mean-square error for numpy arrays."""
    return np.sqrt(((predictions - targets) ** 2).mean())
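
# Quick check of the two RMSE helpers (a minimal sketch with made-up values):
#   RMSELoss()(torch.tensor([1.0, 2.0]), torch.tensor([1.0, 4.0]))  # tensor(~1.4142)
#   RMSE(np.array([1.0, 2.0]), np.array([1.0, 4.0]))                # ~1.4142
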
def sensitivity(y_true, y_pred):
    """True positive rate: TP / (TP + FN)."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tp / (tp + fn)

def specificity(y_true, y_pred):
    """True negative rate: TN / (TN + FP)."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)
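
# Worked example (illustrative): y_true = [1, 1, 0, 0], y_pred = [1, 0, 0, 0]
# gives TP=1, FN=1, TN=2, FP=0, so sensitivity = 0.5 and specificity = 1.0.
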
def get_optim_groups(module, keep_decoder=False, weight_decay=0.0):
    """Split parameters into groups that will and will not receive weight decay.

    Linear weights are decayed; biases, LayerNorm, and Embedding weights are not.
    """
    decay = set()
    no_decay = set()
    whitelist_weight_modules = (torch.nn.Linear,)
    blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
    for mn, m in module.named_modules():
        for pn, p in m.named_parameters():
            fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
            if not keep_decoder and 'decoder' in fpn:  # exclude decoder components
                continue
            if pn.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)
    # map full parameter names back to their parameter tensors
    param_dict = {pn: p for pn, p in module.named_parameters()}
    # build the per-group optimizer arguments; only the whitelist group is decayed
    optim_groups = [
        {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
        {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
    ]
    return optim_groups
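
# Usage sketch (hypothetical `model`): feed the groups to an optimizer so only
# the whitelisted weights are regularized.
#   groups = get_optim_groups(model, weight_decay=0.1)
#   optimizer = torch.optim.AdamW(groups, lr=3e-4)
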
class CustomDataset(Dataset):
    """Single-task dataset yielding (SMILES, label) pairs from a DataFrame."""

    def __init__(self, dataset, target):
        self.dataset = dataset
        self.target = target

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        smiles = self.dataset['canon_smiles'].iloc[idx]
        labels = self.dataset[self.target].iloc[idx]
        return smiles, labels
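
# Usage sketch (assumes a DataFrame `df` with a 'canon_smiles' column and a
# numeric column named 'target'; both names are illustrative):
#   ds = CustomDataset(df, target='target')
#   loader = torch.utils.data.DataLoader(ds, batch_size=32, shuffle=True)
#   smiles_batch, label_batch = next(iter(loader))
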
class CustomDatasetMultitask(Dataset):
    """Multi-task dataset; NaN targets are zero-filled and flagged with a 0/1 mask."""

    def __init__(self, dataset, targets):
        self.dataset = dataset
        self.targets = targets

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        smiles = self.dataset['canon_smiles'].iloc[idx]
        labels = self.dataset[self.targets].iloc[idx].to_numpy()
        # mask is 1.0 where a label is present, 0.0 where it is missing (NaN)
        mask = [0.0 if np.isnan(x) else 1.0 for x in labels]
        # replace NaNs with 0.0 so the tensor is well-defined; the mask zeroes their loss
        labels = [0.0 if np.isnan(x) else x for x in labels]
        return smiles, torch.tensor(labels, dtype=torch.float32), torch.tensor(mask)
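
# Usage sketch (assumes `df` has 'canon_smiles' plus one column per task, with
# missing labels stored as NaN; the task names are illustrative):
#   ds = CustomDatasetMultitask(df, targets=['task_a', 'task_b'])
#   smiles, labels, mask = ds[0]  # mask[i] == 0.0 marks a missing label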