File size: 4,947 Bytes
cf004a6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
This file includes all necessary code to preprocess molecules (assumed to be in SMILES
format) and create descriptors which can be fed into MHNfs.    
"""

#---------------------------------------------------------------------------------------
# Dependencies
import pickle
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdFingerprintGenerator
from rdkit.Chem.rdchem import Mol

from src.data_preprocessing.constants import USED_200_DESCR
from src.data_preprocessing.utils import Standardizer

#---------------------------------------------------------------------------------------
# Define main function

def preprocess_molecules(input_molecules: Union[str, List[str], pd.DataFrame]):
    """
    Preprocess molecules (assumed to be in SMILES format) and create the
    descriptors which can be fed into MHNfs.

    Args:
        input_molecules: Molecules in any form accepted by ``handle_inputs``
            (a comma-separated string, a list of strings or a DataFrame).

    Returns:
        The result of the fitted scaler's ``transform`` applied to the ECFP
        features concatenated column-wise with the quantile-transformed RDKit
        descriptors.
    """
    # The assets folder lives three path components above this file. Using
    # pathlib (instead of splitting on "/") keeps this working regardless of
    # the platform's path separator or a relative ``__file__``.
    project_root = Path(__file__).resolve().parents[2]
    objects_dir = project_root / "assets" / "data_preprocessing_objects"

    # NOTE: pickle.load can execute arbitrary code; only the files shipped in
    # the assets folder must ever be loaded here.
    with open(objects_dir / "scaler_fitted.pkl", "rb") as fl:
        scaler = pickle.load(fl)

    with open(objects_dir / "ecdfs.pkl", "rb") as fl:
        ecdfs = pickle.load(fl)

    # Ensure that the input is a list of SMILES strings
    input_smiles = handle_inputs(input_molecules)

    # Create cleaned RDKit mol objects (kept under a separate name so the
    # caller-supplied argument is not shadowed)
    cleaned_mols = create_cleaned_mol_objects(input_smiles)

    # Create fingerprints and descriptors
    ecfps = create_ecfp_fps(cleaned_mols)
    rdkit_descrs = create_rdkit_descriptors(cleaned_mols)

    # Map the raw descriptor values to quantiles
    rdkit_descr_quantils = create_quantils(rdkit_descrs, ecdfs)

    # Concatenate features
    raw_features = np.concatenate((ecfps, rdkit_descr_quantils), axis=1)

    # Normalize and return the feature vectors
    return scaler.transform(raw_features)
    
#---------------------------------------------------------------------------------------
# Define helper functions
def handle_inputs(input_molecules: Union[str, List[str], pd.DataFrame]) -> List[str]:
    """
    Convert the supported input formats into a list of SMILES strings.

    Args:
        input_molecules: One of
            * a list, which is returned unchanged,
            * a pandas DataFrame having a column whose name equals "smiles"
              when compared case-insensitively,
            * a string with comma-separated SMILES; surrounding whitespace is
              stripped and empty entries are dropped.

    Returns:
        A list with one entry per molecule.

    Raises:
        ValueError: If a DataFrame without a "smiles" column is passed.
        TypeError: If the input is none of the supported types.
    """
    if isinstance(input_molecules, list):
        return input_molecules

    if isinstance(input_molecules, pd.DataFrame):
        # Search the column case-insensitively instead of renaming the columns:
        # assigning to ``.columns`` would modify the caller's DataFrame, and
        # calling ``.lower()`` on every label fails for non-string labels.
        for position, label in enumerate(input_molecules.columns):
            if isinstance(label, str) and label.lower() == "smiles":
                # Positional access stays unambiguous even with duplicate labels
                return input_molecules.iloc[:, position].tolist()
        raise ValueError(("Input DataFrame must have a column named 'Smiles'."))

    if isinstance(input_molecules, str):
        stripped = (entry.strip() for entry in input_molecules.split(","))
        return [entry for entry in stripped if entry != ""]

    raise TypeError(("Input molecules must be a string,a list of strings or a "
                     "pandas DataFrame."))

def create_ecfp_fps(mols: List[Mol]) -> np.ndarray:
    """
    Create ECFP (Morgan) count fingerprints for a list of molecules.

    Every molecule is passed individually to
    ``rdFingerprintGenerator.GetCountFPs`` with ``fpType=MorganFP`` and the
    resulting fingerprint is converted into a NumPy vector. The vectors of all
    molecules are returned stacked in one array, in input order.
    """

    def _fingerprint_vector(molecule: Mol) -> np.ndarray:
        # GetCountFPs works on a sequence, hence the single-element list
        count_fp = rdFingerprintGenerator.GetCountFPs(
            [molecule], fpType=rdFingerprintGenerator.MorganFP
        )[0]
        # ConvertToNumpyArray writes its result into the array handed to it
        vector = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(count_fp, vector)
        return vector

    return np.array([_fingerprint_vector(molecule) for molecule in mols])

def create_rdkit_descriptors(mols: List[Mol]) -> np.ndarray:
    """
    Compute the RDKit descriptors used by the model for a list of molecules.

    For each molecule every calculator listed in ``Descriptors._descList`` is
    evaluated; afterwards the values are reduced to the entries selected by
    ``USED_200_DESCR``. The per-molecule vectors are returned stacked in one
    array, in input order.
    """
    per_molecule = []

    for molecule in mols:
        # _descList holds (name, function) pairs; only the function is needed
        all_values = np.array(
            [calculate(molecule) for _name, calculate in Descriptors._descList]
        )
        per_molecule.append(all_values[USED_200_DESCR])

    return np.array(per_molecule)

def create_quantils(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
    """
    Map every feature column to quantiles using one ECDF per column.

    Args:
        raw_features: Array indexed as ``[row, column]``.
        ecdfs: Sequence of callables; ``ecdfs[i]`` is called with the values of
            column ``i`` and must return one value per row.

    Returns:
        Array with the same shape as ``raw_features`` holding the ECDF outputs.
        A floating-point input dtype is preserved; any other dtype yields
        ``float64``.
    """
    # ``np.zeros_like`` would inherit an integer dtype from the input and
    # silently truncate the quantiles (values in [0, 1]) to 0, so the output
    # dtype is forced to a floating-point type.
    if np.issubdtype(raw_features.dtype, np.floating):
        out_dtype = raw_features.dtype
    else:
        out_dtype = np.float64
    quantils = np.zeros(raw_features.shape, dtype=out_dtype)

    for column in range(raw_features.shape[1]):
        # Flatten the column to 1-D before evaluating its ECDF
        raw_values = raw_features[:, column].reshape(-1)
        quantils[:, column] = ecdfs[column](raw_values)

    return quantils

def create_cleaned_mol_objects(smiles: List[str]) -> List[Mol]:
    """
    Create cleaned RDKit mol objects from a list of SMILES.

    Each SMILES is parsed, standardized with ``Standardizer(canon_taut=True)``
    and finally rebuilt from the SMILES written for the standardized molecule.

    Args:
        smiles: SMILES strings, one per molecule.

    Returns:
        One mol object per input SMILES, in input order.

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    sm = Standardizer(canon_taut=True)

    mols = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        # MolFromSmiles reports a parse failure by returning None instead of
        # raising; fail here with a clear message rather than deep inside the
        # standardizer.
        if mol is None:
            raise ValueError(f"Could not parse SMILES string: {smile!r}")

        # NOTE(review): the second return value of standardize_mol is ignored
        # here - confirm whether it flags molecules that should be rejected.
        standardized_mol, _ = sm.standardize_mol(mol)

        # Round trip through SMILES to rebuild the mol from its written form
        can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
        mols.append(can_mol)
    return mols

#---------------------------------------------------------------------------------------