|
import chunk |
|
import os |
|
import warnings |
|
|
|
import pandas as pd |
|
from rdkit import Chem |
|
|
|
from rdkit.Chem import rdFingerprintGenerator |
|
from sklearn.ensemble import RandomForestClassifier |
|
from tqdm.auto import tqdm |
|
import numpy as np |
|
import clamp |
|
import torch |
|
|
|
warnings.filterwarnings("ignore") |
|
|
|
|
|
def generate_morgan_fingerprints(smiles_list, radius=4, n_bits=4048):
    """
    Generate Morgan fingerprints for a list of SMILES.

    Parameters:
        smiles_list : iterable of SMILES strings
        radius : Morgan fingerprint radius
        n_bits : fingerprint bit-vector length

    Returns:
        List aligned with the input: a numpy array per molecule, or
        None where RDKit could not parse the SMILES.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)

    fingerprints = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Report the unparseable SMILES and keep a placeholder so
            # the output stays aligned with the input order.
            print(smi)
            fingerprints.append(None)
        else:
            fingerprints.append(generator.GetFingerprintAsNumPy(mol))

    return fingerprints
|
|
|
def rf(df, train_smiles, test_smiles):
    """
    Train and test RF baseline model.

    Parameters:
        df : pd.DataFrame with 'SMILES' and 'Activity' columns
        train_smiles : list of training set smiles
        test_smiles : list of test set smiles

    Returns:
        preds : predicted probabilities of the active class (column 1
                of predict_proba) for the test set

    Raises:
        Re-raises any exception from prediction (typically caused by a
        None fingerprint from an unparseable SMILES) after printing
        diagnostics.
    """
    train_df = df[df['SMILES'].isin(train_smiles)]
    test_df = df[df['SMILES'].isin(test_smiles)]

    X_train = generate_morgan_fingerprints(train_df['SMILES'])
    X_test = generate_morgan_fingerprints(test_df['SMILES'])

    y_train = train_df['Activity'].values

    clf = RandomForestClassifier(n_estimators=200, random_state=82)
    clf.fit(X_train, y_train)

    try:
        preds = clf.predict_proba(X_test)[:,1]
    except Exception as e:
        print(e)
        print(test_df)
        print(X_test)
        # Re-raise the original error: falling through would hit
        # `return preds` with `preds` unbound and mask it as a NameError.
        raise

    return preds
|
|
|
def fh(smiles_list):
    """
    Look up precomputed FH predictions for the given SMILES.

    Reads 'data/fh_predictions.csv' (columns 'SMILES', 'Prediction')
    and returns the predictions, in file order, for rows whose SMILES
    appear in smiles_list.
    """
    predictions = pd.read_csv('data/fh_predictions.csv')
    selected = predictions['SMILES'].isin(smiles_list)
    return predictions.loc[selected, 'Prediction'].tolist()
|
|
|
def drop_assays_with_limited_data(df, na_min=50, ni_min=100):
    """
    Remove assays that have too few actives or inactives.

    Parameters:
        df : pd.DataFrame with 'AID' and binary 'Activity' (0/1) columns
        na_min : minimum number of actives (Activity == 1) per assay
        ni_min : minimum number of inactives (Activity == 0) per assay

    Returns:
        Filtered DataFrame containing only assays meeting both minima.
    """
    print('Drop assays with not enough datapoints...')
    activity_counts = df.groupby('AID')['Activity'].value_counts().unstack().fillna(0)
    # Guard against a dataset where one class is entirely absent: unstack
    # would then produce no 0 (or 1) column and indexing it would raise
    # a KeyError.
    activity_counts = activity_counts.reindex(columns=[0, 1], fill_value=0)
    mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min)
    # Select kept AIDs directly from the groupby index instead of
    # positionally indexing a separate sorted-unique array with the mask.
    keep = activity_counts.index[mask]
    return df[df['AID'].isin(keep)]
|
|
|
def run(
    n_actives : int,
    n_inactives : int,
    model : str = 'MHNfs',
    task : str = 'UID',
    input_file : str = '',
    output_dir : str = '',
    n_repeats : int = 3,
    seed : int = 42
):
    """
    Benchmark a model over every task in a dataset.

    For each task, n_repeats support sets of n_actives + n_inactives
    molecules are sampled; all remaining molecules form the query set.
    Per-task predictions are written to
    <output_dir>/<model>/<task>/<n_actives>+<n_inactives>x<n_repeats>/<task_id>.csv.

    Parameters:
        n_actives : number of active support molecules per repeat
        n_inactives : number of inactive support molecules per repeat
        model : 'RF' for the random-forest baseline; anything else uses
                the few-shot ActivityPredictor (default 'MHNfs')
        task : column identifying a task ('UID' or 'AID')
        input_file : CSV with at least 'SMILES', 'Activity' and the task column
        output_dir : root directory for result files
        n_repeats : number of support-set resamples per task
        seed : base random seed (seed + repeat index per repeat)
    """
    data = pd.read_csv(input_file)

    if task == 'AID':
        # Assay-level tasks can be small; require at least 30 actives and
        # 30 inactives so support and query sets remain meaningful.
        data = drop_assays_with_limited_data(data, 30, 30)

    output_dir = os.path.join(output_dir, model, task, f'{n_actives}+{n_inactives}x{n_repeats}')
    print(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Process the smallest tasks first.
    tasks = data[task].value_counts(ascending=True).index.tolist()

    if model == 'MHNfs':
        # NOTE(review): ActivityPredictor is imported in the __main__ block;
        # run() relies on it being in module globals at call time.
        predictor = ActivityPredictor()

    for t in tqdm(tasks):

        output_file = os.path.join(output_dir, f'{t}.csv')
        if os.path.exists(output_file):
            # Resume support: skip tasks that already have results.
            continue

        df = data[data[task] == t]

        results = []
        for i in range(n_repeats):

            actives = df.loc[df['Activity'] == 1, 'SMILES'].sample(n=n_actives, random_state=seed+i).tolist()
            inactives = df.loc[df['Activity'] == 0, 'SMILES'].sample(n=n_inactives, random_state=seed+i).tolist()
            test_smiles = df[~df.SMILES.isin(actives+inactives)].SMILES.tolist()

            if model == 'RF':
                preds = rf(df, actives+inactives, test_smiles)
            else:
                if len(test_smiles) > 10_000:
                    # Chunk very large query sets to bound memory usage.
                    # (loop variable renamed from `chunk`, which shadowed
                    # the module-level `import chunk`)
                    chunk_size = 10_000
                    chunks = [test_smiles[j:j + chunk_size] for j in range(0, len(test_smiles), chunk_size)]
                    preds = []
                    for query_chunk in chunks:
                        preds.extend(predictor.predict(query_chunk, actives, inactives))
                else:
                    preds = predictor.predict(test_smiles, actives, inactives)

            d = {
                'SMILES' : test_smiles,
                # .tolist() strips the original DataFrame index; passing an
                # index-carrying Series alongside positional lists relied on
                # fragile DataFrame-constructor alignment semantics.
                'Label' : df[df.SMILES.isin(test_smiles)].Activity.tolist(),
                'Prediction' : preds,
                'Fold' : [i] * len(test_smiles)
            }
            results.append(pd.DataFrame(d))

        results = pd.concat(results)
        results.to_csv(output_file, index=False)
|
|
|
if __name__ == '__main__':

    # The original file shipped with these assignments left empty
    # (`mhnfs_path =`), which is a SyntaxError. Placeholders restore a
    # parseable file; fill them in before running.
    mhnfs_path = ''      # TODO: path to the MHNfs repository checkout
    benchmark_path = ''  # TODO: path to the benchmark data

    import sys
    sys.path.append(mhnfs_path)
    from src.prediction_pipeline import ActivityPredictor

    support_sets = [(1, 7), (2, 6), (4, 4)]
    models = ['RF', 'MHNfs']
    tasks = ['AID', 'UID']

    input_file = ''      # TODO: CSV with 'SMILES', 'Activity', 'AID', 'UID' columns

    # Full grid: every support-set size x model x task definition.
    for support_set in support_sets:
        for model in models:
            for task in tasks:
                run(*support_set, task=task, model=model, input_file=input_file)