# mhnfs/pubchem_experiment/make_predictions.py
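"""
Few-shot activity-prediction benchmark on preprocessed PubChem assay data.

For each task (assay 'AID' or target 'UID'), support sets of n_actives /
n_inactives molecules are sampled over several replicates and the remaining
molecules are scored with either a Random Forest baseline on Morgan
fingerprints or the MHNfs few-shot predictor. One result CSV is written per
task with columns SMILES, Label, Prediction, and Fold.
"""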
import os
import warnings
import pandas as pd
from rdkit import Chem
# from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm
import numpy as np
import clamp
import torch
warnings.filterwarnings("ignore")


def generate_morgan_fingerprints(smiles_list, radius=4, n_bits=4048):
    """
    Generate Morgan fingerprints for a list of SMILES.

    Returns a list aligned with smiles_list: one numpy array per molecule,
    or None for SMILES that RDKit fails to parse.
    """
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
    fps = []
    for smiles, mol in zip(smiles_list, mols):
        if mol is None:
            print(f'Could not parse SMILES: {smiles}')
            fps.append(None)
        else:
            fps.append(mfpgen.GetFingerprintAsNumPy(mol))
    return fps
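# Usage sketch ('CCO' is just an illustrative SMILES, not from the dataset):
#   fps = generate_morgan_fingerprints(['CCO'])
#   fps[0] is a length-4048 numpy array (None for unparseable SMILES)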


def rf(df, train_smiles, test_smiles):
    """
    Train and test RF baseline model.
    Parameters:
        df : pd.DataFrame with 'SMILES' and 'Activity' columns
        train_smiles : list of training set smiles
        test_smiles : list of test set smiles
    Returns:
        preds : array of predicted active-class probabilities for the test set
    """
    train_df = df[df['SMILES'].isin(train_smiles)]
    test_df = df[df['SMILES'].isin(test_smiles)]
    # Generate Morgan fingerprints for training and test sets
    X_train = generate_morgan_fingerprints(train_df['SMILES'])
    X_test = generate_morgan_fingerprints(test_df['SMILES'])
    # Extract labels
    y_train = train_df['Activity'].values
    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=200, random_state=82)
    clf.fit(X_train, y_train)
    # Make predictions on the test set
    try:
        preds = clf.predict_proba(X_test)[:, 1]
    except Exception as e:
        print(e)
        print(test_df)
        print(X_test)
        raise  # re-raise so a failed prediction is not silently returned as an unbound name
    return preds
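# Usage sketch (names are illustrative): given a per-task dataframe `task_df`,
#   preds = rf(task_df, train_smiles=actives + inactives, test_smiles=query_smiles)
# returns active-class probabilities for the test molecules.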


def fh(smiles_list):
    """
    Look up precomputed predictions for the given SMILES from
    data/fh_predictions.csv (columns 'SMILES' and 'Prediction').
    """
    df = pd.read_csv('data/fh_predictions.csv')
    preds = df[df['SMILES'].isin(smiles_list)]['Prediction'].tolist()
    return preds


def drop_assays_with_limited_data(df, na_min=50, ni_min=100):
    """Drop assays (AIDs) with fewer than na_min actives or ni_min inactives."""
    print('Drop assays with not enough datapoints...')
    unique_uids = df['AID'].sort_values().unique()  # sorted unique assay IDs
    # Matrix: rows = sorted assay IDs, columns = counts of inactives (0) and actives (1)
    activity_counts = df.groupby('AID')['Activity'].value_counts().unstack().fillna(0)
    mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min)  # enough actives and inactives
    df = df[df['AID'].isin(unique_uids[mask])]
    return df
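# Example: with the defaults na_min=50 and ni_min=100, an assay with 60 actives
# but only 80 inactives is dropped; one with 55 actives and 120 inactives is kept.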


def run(
    n_actives: int,
    n_inactives: int,
    model: str = 'MHNfs',
    task: str = 'UID',
    input_file: str = '',  # todo add path
    output_dir: str = '',  # todo add path
    n_repeats: int = 3,
    seed: int = 42
):
    """
    Sample support sets for each task over n_repeats replicates, score the
    remaining molecules with the chosen model, and write one CSV per task.
    """
# Load data
data = pd.read_csv(input_file)
if task == 'AID':
data = drop_assays_with_limited_data(data, 30, 30)
# Output dir
output_dir = os.path.join(output_dir, model, task, f'{n_actives}+{n_inactives}x{n_repeats}')
print(output_dir)
os.makedirs(output_dir, exist_ok=True)
# Tasks
tasks = data[task].value_counts(ascending=True).index.tolist()
# print(tasks)
if model == 'MHNfs':
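        # ActivityPredictor is imported from the mhnfs repo in the __main__ block below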
predictor = ActivityPredictor()
# Iterate over tasks
for t in tqdm(tasks):
# Output file
output_file = os.path.join(output_dir, f'{t}.csv')
if os.path.exists(output_file):
continue
# Data for task
df = data[data[task] == t]
# Iterate over replicates
results = []
for i in range(n_repeats):
# Select support sets and test molecules
actives = df.loc[df['Activity'] == 1, 'SMILES'].sample(n=n_actives, random_state=seed+i).tolist()
inactives = df.loc[df['Activity'] == 0, 'SMILES'].sample(n=n_inactives, random_state=seed+i).tolist()
test_smiles = df[~df.SMILES.isin(actives+inactives)].SMILES.tolist()
if model == 'RF':
preds = rf(df, actives+inactives, test_smiles)
else:
                if len(test_smiles) > 10_000:
                    # MHNfs breaks for over 20_000 datapoints -> score in chunks of 10_000
                    chunk_size = 10_000
                    chunks = [test_smiles[j:j + chunk_size] for j in range(0, len(test_smiles), chunk_size)]
                    preds = []
                    for chunk in chunks:
                        preds.extend(predictor.predict(chunk, actives, inactives))
else:
preds = predictor.predict(test_smiles, actives, inactives)
            d = {
                'SMILES': test_smiles,
                # .tolist() avoids pandas index alignment; rows follow the same df order as test_smiles
                'Label': df[df.SMILES.isin(test_smiles)].Activity.tolist(),
                'Prediction': preds,
                'Fold': [i] * len(test_smiles)
            }
results.append(pd.DataFrame(d))
results = pd.concat(results)
results.to_csv(output_file, index=False)


if __name__ == '__main__':
    import sys

    mhnfs_path = ''  # todo add path: mhnfs_path + '/mhnfs'
    benchmark_path = ''  # todo add path: project_path
    sys.path.append(mhnfs_path)
    from src.prediction_pipeline import ActivityPredictor

    support_sets = [(1, 7), (2, 6), (4, 4)]
    models = ['RF', 'MHNfs']
    tasks = ['AID', 'UID']
    input_file = ''  # todo add path: preprocessed_data path + '/pubchem24_preprocessed_2.csv.gz'
for support_set in support_sets:
for model in models:
for task in tasks:
run(*support_set, task=task, model=model, input_file=input_file)