# File size: 5,699 Bytes
# 9afbc33
import chunk
import os
import warnings
import pandas as pd
from rdkit import Chem
# from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm
import numpy as np
import clamp
import torch
warnings.filterwarnings("ignore")
def generate_morgan_fingerprints(smiles_list, radius=4, n_bits=4048):
    """
    Compute one Morgan fingerprint per SMILES string.

    Parameters:
    smiles_list : iterable of SMILES strings (iterated twice, so it must be re-iterable)
    radius : radius passed to the Morgan fingerprint generator
    n_bits : fingerprint size passed to the generator as ``fpSize``

    Returns:
    fingerprints : list with one entry per input SMILES, in input order; each entry
        is the value returned by ``GetFingerprintAsNumPy`` or ``None`` when the
        SMILES could not be parsed (the offending SMILES is printed).
    """
    # NOTE(review): the default of 4048 bits is not a power of two; 4096 may have
    # been intended -- confirm before changing, since it alters the features.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)

    # Parse every SMILES up front, then walk inputs and parsed molecules in lockstep.
    parsed = list(map(Chem.MolFromSmiles, smiles_list))

    fingerprints = []
    for smi, molecule in zip(smiles_list, parsed):
        if molecule is not None:
            fingerprints.append(generator.GetFingerprintAsNumPy(molecule))
            continue
        # Unparseable SMILES: report it and keep a placeholder so positions line up.
        print(smi)
        fingerprints.append(None)
    return fingerprints
def rf(df, train_smiles, test_smiles):
    """
    Train and test RF baseline model.

    Parameters:
    df : pd.DataFrame with 'SMILES' and 'Activity' columns
    train_smiles : list of training set smiles
    test_smiles : list of test set smiles

    Returns:
    preds : second column of ``predict_proba`` for the test rows, in the row
        order of ``df`` (presumably the probability of label 1 when the labels
        are 0/1 -- the column order follows ``clf.classes_``).

    Raises:
    Whatever ``predict_proba`` raises; the error, the test frame and the test
    fingerprints are printed first and the original exception is re-raised.
    """
    # Rows are selected by SMILES membership, so the row order follows `df`,
    # not the order of the lists passed in.
    train_df = df[df['SMILES'].isin(train_smiles)]
    test_df = df[df['SMILES'].isin(test_smiles)]

    # Generate Morgan fingerprints for training and test sets
    X_train = generate_morgan_fingerprints(train_df['SMILES'])
    X_test = generate_morgan_fingerprints(test_df['SMILES'])

    # Extract labels
    y_train = train_df['Activity'].values

    # Train a Random Forest Classifier (fixed seed for reproducibility)
    clf = RandomForestClassifier(n_estimators=200, random_state=82)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    try:
        preds = clf.predict_proba(X_test)[:, 1]
    except Exception as e:
        # Dump context for debugging, then propagate the real error. Without the
        # re-raise, `preds` would be unbound and the function would fail with an
        # unrelated UnboundLocalError that hides the actual cause.
        print(e)
        print(test_df)
        print(X_test)
        raise
    return preds
def fh(smiles_list):
    """
    Look up stored predictions for the given SMILES.

    Reads 'data/fh_predictions.csv' (relative to the working directory) and
    returns the 'Prediction' values of every row whose 'SMILES' is contained in
    ``smiles_list``. The result follows the row order of the CSV file, not the
    order of ``smiles_list``.
    """
    table = pd.read_csv('data/fh_predictions.csv')
    wanted = table['SMILES'].isin(smiles_list)
    return table.loc[wanted, 'Prediction'].tolist()
def drop_assays_with_limited_data(df, na_min=50, ni_min=100):
    """
    Keep only the assays that have enough active and inactive datapoints.

    Parameters:
    df : pd.DataFrame with 'AID' and 'Activity' columns (Activity 1 = active, 0 = inactive)
    na_min : minimum number of actives an assay needs to be kept
    ni_min : minimum number of inactives an assay needs to be kept

    Returns:
    df : the rows of the input whose 'AID' meets both thresholds. Rows with a
        missing 'AID' are never kept.
    """
    print('Drop assays with not enough datapoints...')
    assay_ids = df['AID']

    # Count actives and inactives per assay directly. Both series share the same
    # index (the assay ids), so the thresholds are applied label-wise instead of
    # relying on two separately sorted arrays lining up by position. This also
    # works when one activity class is absent from the whole frame.
    n_actives = (df['Activity'] == 1).astype(int).groupby(assay_ids).sum()
    n_inactives = (df['Activity'] == 0).astype(int).groupby(assay_ids).sum()

    enough = (n_actives >= na_min) & (n_inactives >= ni_min)
    kept_ids = enough[enough].index
    return df[assay_ids.isin(kept_ids)]
def run(
    n_actives : int,
    n_inactives : int,
    model : str = 'MHNfs',
    task : str = 'UID',
    input_file : str = '', # todo add path
    output_dir : str = '', # todo add path
    n_repeats : int = 3,
    seed : int = 42
):
    """
    Run the few-shot benchmark for one model / task-column / support-set size.

    For every distinct value of the ``task`` column, ``n_repeats`` support sets
    (``n_actives`` actives + ``n_inactives`` inactives) are sampled, all remaining
    molecules of that task are predicted, and the predictions of all repeats are
    written to ``<output_dir>/<model>/<task>/<n_actives>+<n_inactives>x<n_repeats>/<task value>.csv``.
    Tasks whose output file already exists are skipped.

    Parameters:
    n_actives : number of active support molecules sampled per repeat
    n_inactives : number of inactive support molecules sampled per repeat
    model : 'RF' or 'MHNfs'
    task : name of the column that defines the tasks; 'AID' additionally
        filters assays via drop_assays_with_limited_data(data, 30, 30)
    input_file : CSV with 'SMILES', 'Activity' and the task column
    output_dir : root directory for the result files
    n_repeats : number of sampled support sets per task
    seed : base random seed; repeat ``i`` samples with ``seed + i``

    Raises:
    ValueError : if ``model`` is not one of the supported names.

    Note: for 'MHNfs' the name ``ActivityPredictor`` must be available as a
    module-level global; this file imports it only inside the ``__main__`` block.
    """
    # Fail early on a bad model name instead of hitting a NameError on
    # `predictor` deep inside the loop after data was loaded and dirs created.
    if model not in ('RF', 'MHNfs'):
        raise ValueError(f"Unknown model {model!r}; expected 'RF' or 'MHNfs'")

    # Load data
    data = pd.read_csv(input_file)
    if task == 'AID':
        data = drop_assays_with_limited_data(data, 30, 30)

    # Output dir
    output_dir = os.path.join(output_dir, model, task, f'{n_actives}+{n_inactives}x{n_repeats}')
    print(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Tasks, smallest first
    tasks = data[task].value_counts(ascending=True).index.tolist()

    predictor = ActivityPredictor() if model == 'MHNfs' else None
    # MHNfs breaks for over 20_000 datapoints -> predict in chunks of this size
    chunk_size = 10_000

    # Iterate over tasks
    for t in tqdm(tasks):
        # Output file; an existing file marks the task as already done
        output_file = os.path.join(output_dir, f'{t}.csv')
        if os.path.exists(output_file):
            continue

        # Data for task
        df = data[data[task] == t]

        # Iterate over replicates
        results = []
        for fold in range(n_repeats):
            # Select support sets.
            # NOTE(review): `sample` raises ValueError when a task has fewer
            # actives/inactives than requested -- confirm this is intended for
            # task columns that are not pre-filtered.
            actives = df.loc[df['Activity'] == 1, 'SMILES'].sample(n=n_actives, random_state=seed + fold).tolist()
            inactives = df.loc[df['Activity'] == 0, 'SMILES'].sample(n=n_inactives, random_state=seed + fold).tolist()
            support = actives + inactives

            # Everything outside the support set is a test molecule. The same
            # frame provides SMILES and labels so both stay row-aligned.
            test_df = df[~df['SMILES'].isin(support)]
            test_smiles = test_df['SMILES'].tolist()

            if model == 'RF':
                preds = rf(df, support, test_smiles)
            elif len(test_smiles) > chunk_size:
                preds = []
                for start in range(0, len(test_smiles), chunk_size):
                    batch = test_smiles[start:start + chunk_size]
                    preds.extend(predictor.predict(batch, actives, inactives))
            else:
                preds = predictor.predict(test_smiles, actives, inactives)

            results.append(pd.DataFrame({
                'SMILES' : test_smiles,
                'Label' : test_df['Activity'],
                'Prediction' : preds,
                'Fold' : [fold] * len(test_smiles),
            }))

        results = pd.concat(results)
        results.to_csv(output_file, index=False)
if __name__ == '__main__':
    # Script entry point: runs every support-set size x model x task-column
    # combination. The three paths below are placeholders and must be filled
    # in before running (an empty right-hand side is a SyntaxError).
    mhnfs_path = ''  # todo add path: mhnfs_path + '/mhnfs'
    benchmark_path = ''  # todo add path: project_path

    # The MHNfs code lives outside this project; make it importable first.
    import sys
    sys.path.append(mhnfs_path)
    from src.prediction_pipeline import ActivityPredictor

    # (n_actives, n_inactives) per support set
    support_sets = [(1,7), (2,6), (4,4)]
    models = ['RF', 'MHNfs']
    tasks = ['AID', 'UID']
    input_file = ''  # todo add path: preprocessed_data path + '/pubchem24_preprocessed_2.csv.gz'

    for support_set in support_sets:
        for model in models:
            for task in tasks:
                run(*support_set, task=task, model=model, input_file=input_file)