# Source: Hugging Face Space ml-jku/mhnfs (status: Running)
# File: mhnfs/pubchem_experiment/data_preprocess.py (8.45 kB)
# Last commit: 9afbc33 "Upload 3 files" by Tschoui (verified, 7 months ago)

	import json
	import os
	import re

	import pandas as pd
	import swifter
	import tqdm
	from rdkit import Chem

	# Disable RDKit informational and warning messages
	from rdkit import RDLogger
	RDLogger.DisableLog('rdApp.*')

	# Path configuration.
	# The original assignments had their right-hand sides commented out
	# (e.g. `PUBCHEM_DIR = # pubchem_path + 'pubchem24/'`), which does not parse.
	# The base directories are read from environment variables (falling back to the
	# current directory) and the sub-paths hinted at in those comments are appended.
	PUBCHEM_DIR = os.path.join(os.environ.get('PUBCHEM_ROOT', '.'), 'pubchem24')
	FSMOL_UID_PATH = os.path.join(os.environ.get('FSMOL_ROOT', '.'), 'fsmol', 'fsmol_train_accession_keys.json')
	PROT_CLASS_PATH = os.path.join(os.environ.get('CHEMBL_ROOT', '.'), 'chembl33', 'uniprot_pclass_mapping.csv')
	MHNFS_PATH = os.path.join(os.environ.get('MHNFS_ROOT', '.'), 'mhnfs')

	# Make the MHNFS checkout importable so that its `src` package can be resolved
	import sys
	sys.path.append(MHNFS_PATH)
	from src.data_preprocessing.utils import Standardizer

	class PubChemFilter:
	    """
	    Step-by-step preprocessing of PubChem bioactivity data.

	    The methods are meant to be called in sequence (see the ``__main__`` section of
	    this script). Intermediate results are kept on the instance:

	    * ``self.df_assays`` -- assay table (columns ``AID``, ``UID``), set by
	      ``load_and_filter_assays``
	    * ``self.df_bio`` -- bioactivity table (``AID``, ``CID``, ``Activity``), set by
	      ``load_and_filter_bioactivities``
	    * ``self.df`` -- merged working table, set by ``merge_assay_and_activity_data``
	      (or ``load``) and modified by all later steps
	    """

	    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug = False):
	        """
	        Args:
	            pubchem_dir: directory containing ``bioassays.tsv.gz``,
	                ``bioactivities.tsv.gz`` and ``smiles.tsv.gz``
	            fsmol_uid_path: JSON file whose values are lists of accession keys
	            prot_class_path: CSV file with the protein classification table
	            mhnfs_path: stored on the instance; not used by the methods of this class
	            debug: if True, only the first chunk of each chunked file is read
	        """
	        self.pubchem_dir = pubchem_dir
	        self.fsmol_uid_path = fsmol_uid_path
	        self.prot_class_path = prot_class_path
	        self.mhnfs_path = mhnfs_path
	        self.debug = debug

	    def load_and_filter_assays(self):
	        """
	        Load PubChem Assay data from file and filter them:
	        1. Drop all assays without protein accession keys
	        2. Drop all assays linked to multiple accession keys
	        3. Drop all assays with accession keys in FSmol training data

	        The result is stored in ``self.df_assays`` (columns ``AID`` and ``UID``);
	        nothing is returned.
	        """

	        print('Load assays...')
	        df_assays = pd.read_table(
	            f'{self.pubchem_dir}/bioassays.tsv.gz',
	            usecols=['AID', 'UniProts IDs'],
	        ).rename(columns={'UniProts IDs': 'UID'})

	        # Load FSmol training data accession keys (flatten the lists, drop repeats)
	        with open(self.fsmol_uid_path, 'r') as f:
	            fs_train_targets = {key for sublist in json.load(f).values() for key in sublist}

	        print('Filter assays...')
	        df_assays = df_assays.dropna(subset=['UID'])
	        # Several accession keys in one field are separated by a literal '|'
	        df_assays = df_assays[~df_assays['UID'].str.contains('|', regex=False)]
	        if fs_train_targets:
	            # Regex alternation over all training keys. The keys are escaped so they
	            # are matched literally, and an empty key set is skipped because an
	            # empty pattern would match (and thereby drop) every assay.
	            pattern = '|'.join(re.escape(key) for key in sorted(fs_train_targets))
	            df_assays = df_assays[~df_assays['UID'].str.contains(pattern)]
	        self.df_assays = df_assays

	    def load_and_filter_bioactivities(self, chunk_size=10_000_000):
	        """
	        Load bioactivity data in chunks and filter out datapoints with
	        1. assay not in the AIDs of ``self.df_assays``
	        2. outcome not 'Active'/'Inactive'
	        3. missing compound id (CID)

	        The outcome is encoded as ``Activity`` (1 for 'Active', 0 for 'Inactive') and
	        the result is stored in ``self.df_bio`` with integer columns.

	        Args:
	            chunk_size: number of rows read per chunk
	        """

	        print('Load bioactivities...')
	        aids = self.df_assays.AID.tolist()
	        filtered_chunks = []
	        reader = pd.read_csv(
	            f'{self.pubchem_dir}/bioactivities.tsv.gz',
	            sep='\t',
	            chunksize=chunk_size,
	            usecols=['AID', 'CID', 'Activity Outcome'],
	        )
	        for chunk in reader:
	            keep = chunk['AID'].isin(aids) & chunk['Activity Outcome'].isin(['Inactive', 'Active'])
	            filtered_chunks.append(chunk[keep])
	            if self.debug:
	                break  # For debugging: only look at the first chunk
	        df_bio = pd.concat(filtered_chunks)
	        df_bio = df_bio[df_bio.CID.notna()]
	        activity = (df_bio['Activity Outcome'] == 'Active').astype(int)
	        self.df_bio = df_bio.drop('Activity Outcome', axis=1).astype(int)
	        self.df_bio['Activity'] = activity

	    def merge_assay_and_activity_data(self):
	        """
	        Left-join the assay table onto the bioactivity table via ``AID`` and store the
	        result in ``self.df``. ``UID`` is cast to str, every other column to int32.
	        ``self.df_assays`` and ``self.df_bio`` are deleted afterwards.
	        """
	        print('Merge...')
	        self.df = self.df_bio.merge(self.df_assays, on='AID', how='left')
	        convert_dict = {col: 'int32' if col != 'UID' else 'str' for col in self.df.columns}
	        self.df = self.df.astype(convert_dict)
	        del self.df_assays, self.df_bio

	    def drop_hts_assays(self, max_datapoints=100_000):
	        """
	        Drop all rows of assays that have more than ``max_datapoints`` rows in ``self.df``.

	        Args:
	            max_datapoints: largest number of rows an assay may have to be kept
	        """
	        print('Drop HTS assays...')
	        aid_counts = self.df.groupby('AID').size()
	        filtered_aids = aid_counts[aid_counts <= max_datapoints].index
	        self.df = self.df[self.df['AID'].isin(filtered_aids)]

	    def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
	        """
	        Keep only targets (``UID``) with at least ``na_min`` rows of ``Activity == 1``
	        and at least ``ni_min`` rows of ``Activity == 0``.

	        Args:
	            na_min: minimum number of active datapoints per target
	            ni_min: minimum number of inactive datapoints per target
	        """
	        print('Drop targets with not enough datapoints...')
	        # Count both classes per target. Both series share the same UID index, and a
	        # class that does not occur for a target simply counts as 0.
	        n_active = self.df['Activity'].eq(1).groupby(self.df['UID']).sum()
	        n_inactive = self.df['Activity'].eq(0).groupby(self.df['UID']).sum()
	        enough = (n_active >= na_min) & (n_inactive >= ni_min)
	        self.df = self.df[self.df['UID'].isin(n_active.index[enough])]

	    def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
	        """
	        Check if each target-compound pair is associated to an unique activity value,
	        i.e. every measure either active or inactive. If not, drop it.

	        Pairs that occur once are kept as they are. For pairs that occur several times
	        with a single activity value only the first row is kept; pairs with differing
	        activity values are removed entirely.

	        Args:
	            target_col: column identifying the target
	            compound_col: column identifying the compound
	        """
	        print('Drop conflicting datapoints...')
	        keys = [target_col, compound_col]

	        # Split into pairs occurring once and pairs occurring several times
	        is_duplicated = self.df.duplicated(subset=keys, keep=False)
	        df_uniques = self.df[~is_duplicated]
	        df_duplicates = self.df[is_duplicated]

	        # Number of distinct activity values per pair, broadcast back to the rows
	        n_outcomes = df_duplicates.groupby(keys)['Activity'].transform('nunique')
	        df_rows = (
	            df_duplicates[n_outcomes == 1]
	            .drop_duplicates(subset=keys, keep='first')
	            .sort_values(keys)
	        )

	        # Nothing to append if there were no duplicates or all of them conflicted
	        if df_rows.empty:
	            self.df = df_uniques
	        else:
	            self.df = pd.concat([df_uniques, df_rows])

	    def add_smiles(self, chunk_size=10_000_000):
	        """
	        Add a standardized ``SMILES`` column to ``self.df``.

	        SMILES are read in chunks from ``smiles.tsv.gz`` (two columns without header:
	        CID, SMILES), restricted to the CIDs present in ``self.df`` and passed through
	        the project's ``Standardizer``. Rows without a usable SMILES are dropped.

	        Args:
	            chunk_size: number of rows read per chunk
	        """
	        print('Retrieve SMILES...')
	        cids = self.df.CID.astype(int).unique()
	        filtered_chunks = []
	        for chunk in pd.read_table(f'{self.pubchem_dir}/smiles.tsv.gz', chunksize=chunk_size, names=['CID', 'SMILES']):
	            filtered_chunks.append(chunk[chunk['CID'].isin(cids)])
	            if self.debug:
	                break  # For debugging: only look at the first chunk
	        df_smiles = pd.concat(filtered_chunks)

	        def cleanup(smiles):
	            """Return the standardized SMILES, or None (after printing the input) on failure."""
	            sm = Standardizer(metal_disconnect=True, canon_taut=True)
	            mol = Chem.MolFromSmiles(smiles)
	            try:
	                standardized_mol, _ = sm.standardize_mol(mol)
	                return Chem.MolToSmiles(standardized_mol)
	            except Exception:
	                # Best effort: report the offending SMILES and drop it later on.
	                # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
	                print(smiles)
	                return None

	        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
	        df_smiles.dropna(inplace=True)

	        self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])

	    def print_stats(self):
	        """Print the number of targets, assays, compounds and rows of ``self.df``."""
	        nassays = self.df['AID'].nunique()
	        ntargets = self.df["UID"].nunique()
	        ncompounds = self.df["CID"].nunique()
	        nactvities = self.df.shape[0]
	        print(f'{ntargets: >5,} targets | {nassays: >6,} assays | {ncompounds: >9,} compounds | {nactvities: >10,} activity data points')

	    def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
	        """
	        Write ``self.df`` to ``fname`` as CSV without the index.

	        The parent directory of ``fname`` is created if it does not exist yet.
	        """
	        print(f'Save to {fname}...')
	        out_dir = os.path.dirname(fname)
	        if out_dir:
	            os.makedirs(out_dir, exist_ok=True)
	        self.df.to_csv(fname, index=False)

	    def load(self, fname):
	        """Read a CSV file into ``self.df``."""
	        print(f'Load from {fname}...')
	        self.df = pd.read_csv(fname)

	    def add_protein_classifications(self):
	        """
	        Retrieve protein classification

	        Left-joins the columns ``Organism``, ``L1`` and ``L2`` of the table at
	        ``self.prot_class_path`` onto ``self.df`` via ``UID``.
	        """
	        print('Retrieve protein classifications...')
	        protein_class = pd.read_csv(self.prot_class_path)
	        print(protein_class)
	        # The table must already provide a 'UID' column. An earlier variant derived it
	        # from 'target_id':
	        # protein_class['UID'] = protein_class['target_id'].swifter.apply(lambda x: x.split('_')[0])
	        self.df = self.df.merge(protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left')

	if __name__ == '__main__':
	    # Build the filter from the module-level path configuration (debug switched off)
	    pubchem_filter = PubChemFilter(PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, False)

	    # Load the raw tables, join them and report the starting point
	    pubchem_filter.load_and_filter_assays()
	    pubchem_filter.load_and_filter_bioactivities()
	    pubchem_filter.merge_assay_and_activity_data()
	    pubchem_filter.print_stats()

	    # Filtering stages in execution order; statistics are printed after each one
	    stages = (
	        (pubchem_filter.drop_hts_assays, {}),
	        (pubchem_filter.drop_targets_with_limited_data, {}),
	        (pubchem_filter.drop_conflicting_bioactivity_measures, {}),
	        (pubchem_filter.drop_targets_with_limited_data, {}),
	        (pubchem_filter.add_smiles, {}),
	        (pubchem_filter.drop_conflicting_bioactivity_measures, {'compound_col': 'SMILES'}),
	        (pubchem_filter.drop_targets_with_limited_data, {}),
	    )
	    for stage, stage_kwargs in stages:
	        stage(**stage_kwargs)
	        pubchem_filter.print_stats()

	    # Annotate the targets and write the final table
	    pubchem_filter.add_protein_classifications()
	    pubchem_filter.save(fname='data/pubchem24/preprocessed.csv.gz')