|
import json
import os
import re

import pandas as pd
import swifter  # noqa: F401  (imported for its side effect: registers the `.swifter` accessor)
import tqdm
from rdkit import Chem
|
|
|
|
|
from rdkit import RDLogger

# Silence every RDKit log channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

# NOTE(review): the values of these four constants were missing in the
# reviewed source (bare `NAME =`, which is a syntax error). The empty strings
# below are placeholders so the module parses -- fill them in before running.
# Directory that holds `bioassays.tsv.gz`, `bioactivities.tsv.gz` and `smiles.tsv.gz`.
PUBCHEM_DIR = ''  # TODO: set path
# JSON file whose values are lists of UniProt IDs (FS-Mol training targets).
FSMOL_UID_PATH = ''  # TODO: set path
# CSV file providing at least the columns UID, Organism, L1 and L2.
PROT_CLASS_PATH = ''  # TODO: set path
# Checkout root that contains the `src.data_preprocessing` package imported below.
MHNFS_PATH = ''  # TODO: set path

import sys

# Must run before the import below: `src.data_preprocessing` is resolved
# relative to MHNFS_PATH.
sys.path.append(MHNFS_PATH)

from src.data_preprocessing.utils import Standardizer
|
|
|
class PubChemFilter:
    """Filter PubChem bioassay data down to a per-target activity table.

    The methods communicate through attributes on the instance:

    * ``load_and_filter_assays`` sets ``self.df_assays`` (columns AID, UID).
    * ``load_and_filter_bioactivities`` reads ``self.df_assays`` and sets
      ``self.df_bio`` (columns AID, CID, Activity).
    * ``merge_assay_and_activity_data`` combines both into ``self.df`` and
      deletes the two intermediate frames.
    * every other method reads and/or replaces ``self.df``.

    Args:
        pubchem_dir: Directory containing ``bioassays.tsv.gz``,
            ``bioactivities.tsv.gz`` and ``smiles.tsv.gz``.
        fsmol_uid_path: JSON file whose values are lists of UniProt IDs.
        prot_class_path: CSV file with (at least) the columns UID, Organism,
            L1 and L2.
        mhnfs_path: Stored on the instance; not read by any method here.
        debug: When true, the chunked readers stop after the first chunk.
    """

    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug=False):
        self.pubchem_dir = pubchem_dir
        self.fsmol_uid_path = fsmol_uid_path
        self.prot_class_path = prot_class_path
        self.mhnfs_path = mhnfs_path
        self.debug = debug

    def load_and_filter_assays(self):
        """Load the PubChem assay table and filter it.

        1. Drop all assays without protein accession keys
        2. Drop all assays linked to multiple accession keys ('|'-separated)
        3. Drop all assays with accession keys in the FS-Mol training data

        The result (columns AID, UID) is stored in ``self.df_assays``;
        nothing is returned.
        """
        print('Load assays...')
        df_assays = pd.read_table(
            f'{self.pubchem_dir}/bioassays.tsv.gz',
            usecols=['AID', 'UniProts IDs'],
        ).rename(columns={'UniProts IDs': 'UID'})

        with open(self.fsmol_uid_path, 'r') as f:
            fs_train_targets = {uid for uids in json.load(f).values() for uid in uids}

        print('Filter assays...')
        df_assays = df_assays.dropna(subset=['UID'])
        # astype(str) keeps the .str accessor usable even if no row survived.
        uids = df_assays['UID'].astype(str)
        # Literal match: '|' is the separator between multiple accession keys.
        df_assays = df_assays[~uids.str.contains('|', regex=False)]
        # An empty target list would produce the regex '' which matches every
        # row and would therefore drop all assays -- skip the filter instead.
        if fs_train_targets:
            pattern = '|'.join(re.escape(uid) for uid in sorted(fs_train_targets))
            df_assays = df_assays[~df_assays['UID'].astype(str).str.contains(pattern)]
        self.df_assays = df_assays

    def load_and_filter_bioactivities(self, chunk_size=10_000_000):
        """Load bioactivity data in chunks and filter out datapoints with

        1. assay not in ``self.df_assays``
        2. outcome not 'Active'/'Inactive'
        3. missing CID

        Stores the result in ``self.df_bio`` with integer columns AID, CID and
        Activity (1 for 'Active', 0 for 'Inactive').

        Args:
            chunk_size: Number of rows read per chunk.
        """
        print('Load bioactivities...')
        aids = set(self.df_assays['AID'])
        valid_outcomes = ['Inactive', 'Active']
        filtered_chunks = []
        reader = pd.read_csv(
            f'{self.pubchem_dir}/bioactivities.tsv.gz',
            sep='\t',
            chunksize=chunk_size,
            usecols=['AID', 'CID', 'Activity Outcome'],
        )
        for chunk in reader:
            keep = chunk['AID'].isin(aids) & chunk['Activity Outcome'].isin(valid_outcomes)
            filtered_chunks.append(chunk[keep])
            if self.debug:
                # Debug mode: only the first chunk is processed.
                break
        df_bio = pd.concat(filtered_chunks)
        # Copy so the column assignment below does not write into a view.
        df_bio = df_bio[df_bio['CID'].notna()].copy()
        df_bio['Activity'] = (df_bio['Activity Outcome'] == 'Active').astype(int)
        self.df_bio = df_bio.drop(columns='Activity Outcome').astype(int)

    def merge_assay_and_activity_data(self):
        """Attach the UID of each assay to the activity rows.

        Left-joins ``self.df_bio`` with ``self.df_assays`` on AID, casts UID
        to str and every other column to int32, stores the result in
        ``self.df`` and deletes the two source frames from the instance.
        """
        print('Merge...')
        merged = self.df_bio.merge(self.df_assays, on='AID', how='left')
        convert_dict = {col: 'str' if col == 'UID' else 'int32' for col in merged.columns}
        self.df = merged.astype(convert_dict)
        del self.df_assays, self.df_bio

    def drop_hts_assays(self, max_datapoints=100_000):
        """Drop every assay that has more than ``max_datapoints`` rows.

        Args:
            max_datapoints: Largest number of rows an assay (AID) may have to
                be kept.
        """
        print('Drop HTS assays...')
        rows_per_assay = self.df.groupby('AID')['AID'].transform('size')
        self.df = self.df[rows_per_assay <= max_datapoints]

    def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
        """Keep only targets with enough active and inactive rows.

        Args:
            na_min: Minimum number of rows with Activity == 1 per UID.
            ni_min: Minimum number of rows with Activity == 0 per UID.
        """
        print('Drop targets with not enough datapoints...')
        uid = self.df['UID']
        # Counting each label separately works even when one of the two
        # labels does not occur anywhere in the frame.
        n_active = (self.df['Activity'] == 1).groupby(uid).sum()
        n_inactive = (self.df['Activity'] == 0).groupby(uid).sum()
        enough = (n_active >= na_min) & (n_inactive >= ni_min)
        self.df = self.df[uid.isin(enough.index[enough])]

    def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
        """Resolve repeated measurements of the same target-compound pair.

        Pairs measured once are kept as they are. Pairs measured several
        times are reduced to their first row if all measurements agree on
        ``Activity`` and are dropped entirely otherwise.

        Args:
            target_col: Column identifying the target.
            compound_col: Column identifying the compound.
        """
        print('Drop conflicting datapoints...')
        keys = [target_col, compound_col]
        is_repeated = self.df.duplicated(subset=keys, keep=False)
        df_uniques = self.df[~is_repeated]
        if not is_repeated.any():
            # Nothing to reconcile.
            self.df = df_uniques
            return

        df_duplicates = self.df[is_repeated]
        n_outcomes = df_duplicates.groupby(keys)['Activity'].transform('nunique')
        consistent = (
            df_duplicates[n_outcomes == 1]
            .drop_duplicates(subset=keys, keep='first')
            # Same row order as iterating the groups in sorted key order.
            .sort_values(keys)
        )
        self.df = pd.concat([df_uniques, consistent])

    def add_smiles(self, chunk_size=10_000_000):
        """Add a standardized SMILES column to ``self.df``.

        Reads ``smiles.tsv.gz`` (two unnamed columns: CID, SMILES) in chunks,
        keeps the CIDs present in ``self.df``, standardizes every SMILES and
        merges the result on CID. Rows whose SMILES is missing or could not
        be standardized are dropped.

        Args:
            chunk_size: Number of rows read per chunk.
        """
        print('Retrieve SMILES...')
        cids = self.df.CID.astype(int).unique()
        filtered_chunks = []
        reader = pd.read_table(
            f'{self.pubchem_dir}/smiles.tsv.gz',
            chunksize=chunk_size,
            names=['CID', 'SMILES'],
        )
        for chunk in reader:
            filtered_chunks.append(chunk[chunk['CID'].isin(cids)])
            if self.debug:
                # Debug mode: only the first chunk is processed.
                break
        df_smiles = pd.concat(filtered_chunks)

        def cleanup(smiles):
            """Return the standardized SMILES, or None (after printing the input) on failure."""
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # RDKit could not parse the string.
                    print(smiles)
                    return None
                # Built per call, as in the original implementation.
                sm = Standardizer(metal_disconnect=True, canon_taut=True)
                standardized_mol, _ = sm.standardize_mol(mol)
                return Chem.MolToSmiles(standardized_mol)
            except Exception:
                # Best effort: report the offending input and drop it later.
                print(smiles)
                return None

        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
        df_smiles = df_smiles.dropna()

        self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])

    def print_stats(self):
        """Print the number of targets, assays, compounds and rows in ``self.df``."""
        nassays = self.df['AID'].nunique()
        ntargets = self.df['UID'].nunique()
        ncompounds = self.df['CID'].nunique()
        nactivities = self.df.shape[0]
        print(f'{ntargets: >5,} targets | {nassays: >6,} assays | {ncompounds: >9,} compounds | {nactivities: >10,} activity data points')

    def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
        """Write ``self.df`` to ``fname`` as CSV without the index.

        The parent directory is created if it does not exist yet.
        """
        print(f'Save to {fname}...')
        parent = os.path.dirname(fname)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self.df.to_csv(fname, index=False)

    def load(self, fname):
        """Replace ``self.df`` with the CSV file at ``fname``."""
        print(f'Load from {fname}...')
        self.df = pd.read_csv(fname)

    def add_protein_classifications(self):
        """Left-join the columns Organism, L1 and L2 onto ``self.df`` by UID.

        The classification table is read from ``self.prot_class_path`` and
        printed before merging.
        """
        print('Retrieve protein classifications...')
        protein_class = pd.read_csv(self.prot_class_path)
        print(protein_class)

        self.df = self.df.merge(protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left')
|
|
|
if __name__ == '__main__':
    # Full preprocessing run: load -> merge -> successive filters -> annotate -> save.
    pipeline = PubChemFilter(PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, False)

    pipeline.load_and_filter_assays()
    pipeline.load_and_filter_bioactivities()
    pipeline.merge_assay_and_activity_data()
    pipeline.print_stats()

    # Each filtering step is followed by a statistics printout. The
    # per-target minimum is re-checked after every step that removes rows.
    filter_steps = (
        (pipeline.drop_hts_assays, {}),
        (pipeline.drop_targets_with_limited_data, {}),
        (pipeline.drop_conflicting_bioactivity_measures, {}),
        (pipeline.drop_targets_with_limited_data, {}),
        (pipeline.add_smiles, {}),
        (pipeline.drop_conflicting_bioactivity_measures, {'compound_col': 'SMILES'}),
        (pipeline.drop_targets_with_limited_data, {}),
    )
    for run_step, step_kwargs in filter_steps:
        run_step(**step_kwargs)
        pipeline.print_stats()

    pipeline.add_protein_classifications()
    pipeline.save(fname='data/pubchem24/preprocessed.csv.gz')
|
|
|
|