import os
import sys
import warnings

import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")


def generate_morgan_fingerprints(smiles_list, radius=4, n_bits=4048):
    """
    Generate Morgan fingerprints for a list of SMILES.
    """
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
    fps = []
    for smiles, mol in zip(smiles_list, mols):
        if mol is None:
            print(f'Could not parse SMILES: {smiles}')
            fps.append(None)
        else:
            fps.append(mfpgen.GetFingerprintAsNumPy(mol))
    return fps


def rf(df, train_smiles, test_smiles):
    """
    Train and evaluate the RF baseline model.

    Parameters:
        df : pd.DataFrame with 'SMILES' and 'Activity' columns
        train_smiles : list of training set SMILES
        test_smiles : list of test set SMILES

    Returns:
        preds : predicted active-class probabilities for the test set
    """
    train_df = df[df['SMILES'].isin(train_smiles)]
    test_df = df[df['SMILES'].isin(test_smiles)]

    # Generate Morgan fingerprints for training and test sets
    X_train = generate_morgan_fingerprints(train_df['SMILES'])
    X_test = generate_morgan_fingerprints(test_df['SMILES'])

    # Extract labels
    y_train = train_df['Activity'].values

    # Train a Random Forest classifier
    clf = RandomForestClassifier(n_estimators=200, random_state=82)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    try:
        preds = clf.predict_proba(X_test)[:, 1]
    except Exception as e:
        # Print the offending data before re-raising; `preds` would
        # otherwise be undefined below.
        print(e)
        print(test_df)
        print(X_test)
        raise
    return preds


def fh(smiles_list):
    """Look up precomputed predictions for the given SMILES."""
    df = pd.read_csv('data/fh_predictions.csv')
    preds = df[df['SMILES'].isin(smiles_list)]['Prediction'].tolist()
    return preds


def drop_assays_with_limited_data(df, na_min=50, ni_min=100):
    """Keep only assays with at least na_min actives and ni_min inactives."""
    print('Dropping assays with too few datapoints...')
    # Label-count matrix: rows = AIDs, columns = activity labels (0/1)
    activity_counts = df.groupby('AID')['Activity'].value_counts().unstack().fillna(0)
    # Both the number of actives and the number of inactives must reach the minimum
    mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min)
    return df[df['AID'].isin(activity_counts.index[mask])]


def run(
        n_actives: int,
        n_inactives: int,
        model: str = 'MHNfs',
        task: str = 'UID',
        input_file: str = '',   # TODO: add path
        output_dir: str = '',   # TODO: add path
        n_repeats: int = 3,
        seed: int = 42
):
    # Load data
    data = pd.read_csv(input_file)
    if task == 'AID':
        data = drop_assays_with_limited_data(data, 30, 30)

    # One output directory per model/task/support-set configuration
    output_dir = os.path.join(output_dir, model, task, f'{n_actives}+{n_inactives}x{n_repeats}')
    print(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Tasks, smallest first
    tasks = data[task].value_counts(ascending=True).index.tolist()

    if model == 'MHNfs':
        predictor = ActivityPredictor()

    # Iterate over tasks
    for t in tqdm(tasks):
        # Skip tasks that already have results
        output_file = os.path.join(output_dir, f'{t}.csv')
        if os.path.exists(output_file):
            continue

        # Data for this task
        df = data[data[task] == t]

        # Iterate over replicates
        results = []
        for i in range(n_repeats):
            # Sample the support set; all remaining molecules form the test set
            actives = df.loc[df['Activity'] == 1, 'SMILES'].sample(n=n_actives, random_state=seed + i).tolist()
            inactives = df.loc[df['Activity'] == 0, 'SMILES'].sample(n=n_inactives, random_state=seed + i).tolist()
            test_smiles = df[~df.SMILES.isin(actives + inactives)].SMILES.tolist()

            if model == 'RF':
                preds = rf(df, actives + inactives, test_smiles)
            else:
                if len(test_smiles) > 10_000:
                    # MHNfs breaks for over 20_000 datapoints -> use chunks to make predictions
                    chunk_size = 10_000
                    chunks = [test_smiles[j:j + chunk_size] for j in range(0, len(test_smiles), chunk_size)]
                    preds = []
                    for chunk in chunks:
                        preds.extend(predictor.predict(chunk, actives, inactives))
                else:
                    preds = predictor.predict(test_smiles, actives, inactives)

            results.append(pd.DataFrame({
                'SMILES': test_smiles,
                # .tolist() aligns labels positionally with test_smiles instead
                # of relying on index alignment
                'Label': df.loc[df.SMILES.isin(test_smiles), 'Activity'].tolist(),
                'Prediction': preds,
                'Fold': [i] * len(test_smiles)
            }))

        pd.concat(results).to_csv(output_file, index=False)


if __name__ == '__main__':
    # TODO: set these paths for your environment
    mhnfs_path = ''       # mhnfs_path + '/mhnfs'
    benchmark_path = ''   # project_path
    input_file = ''       # preprocessed_data path + '/pubchem24_preprocessed_2.csv.gz'

    # ActivityPredictor is imported from the MHNfs repository
    sys.path.append(mhnfs_path)
    from src.prediction_pipeline import ActivityPredictor

    support_sets = [(1, 7), (2, 6), (4, 4)]   # (n_actives, n_inactives)
    models = ['RF', 'MHNfs']
    tasks = ['AID', 'UID']

    for support_set in support_sets:
        for model in models:
            for task in tasks:
                run(*support_set, task=task, model=model, input_file=input_file)
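

# ---------------------------------------------------------------------------
# Optional helper (a sketch, not part of the benchmark pipeline above):
# aggregate the per-task result files written by run() into per-task AUROC
# scores, averaged over folds. It only assumes the column layout written above
# ('Label', 'Prediction', 'Fold'); the function name and the metric choice are
# illustrative additions, not taken from the original code.
import glob

from sklearn.metrics import roc_auc_score


def evaluate_results(results_dir):
    """Return {task_id: mean AUROC over folds} for one output directory."""
    scores = {}
    for path in glob.glob(os.path.join(results_dir, '*.csv')):
        res = pd.read_csv(path)
        task_id = os.path.splitext(os.path.basename(path))[0]
        # AUROC per fold, then mean over folds
        per_fold = res.groupby('Fold').apply(
            lambda fold: roc_auc_score(fold['Label'], fold['Prediction']))
        scores[task_id] = per_fold.mean()
    return scores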