File size: 8,445 Bytes
9afbc33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import json
import os
import re

import pandas as pd
import swifter
import tqdm
from rdkit import Chem

# Disable RDKit informational and warning messages
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Input locations. The original assignments had their values stripped (only a
# trailing comment was left), which is a SyntaxError. Each path is now built from
# a root directory taken from an environment variable (empty string if unset)
# plus the suffix documented in the original comment.
# NOTE(review): the environment variable names are placeholders -- adapt them
# (or hard-code the roots) to match the local data layout.
PUBCHEM_DIR = os.path.join(os.environ.get('PUBCHEM_ROOT', ''), 'pubchem24')
FSMOL_UID_PATH = os.path.join(os.environ.get('FSMOL_ROOT', ''), 'fsmol', 'fsmol_train_accession_keys.json')
PROT_CLASS_PATH = os.path.join(os.environ.get('CHEMBL_ROOT', ''), 'chembl33', 'uniprot_pclass_mapping.csv')
MHNFS_PATH = os.path.join(os.environ.get('MHNFS_ROOT', ''), 'mhnfs')

import sys
sys.path.append(MHNFS_PATH)
from src.data_preprocessing.utils import Standardizer

class PubChemFilter:
    """
    Step-by-step filter pipeline for PubChem bioactivity data.

    Every method performs one step and keeps its result on the instance
    (``self.df_assays`` and ``self.df_bio`` first, ``self.df`` after merging),
    so the methods are meant to be called in sequence.
    """

    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug=False):
        """
        Store the input locations.

        Args:
            pubchem_dir: directory holding bioassays.tsv.gz, bioactivities.tsv.gz and smiles.tsv.gz
            fsmol_uid_path: JSON file whose values are lists of FSmol training accession keys
            prot_class_path: CSV file with the protein classification columns
            mhnfs_path: stored as-is; no method of this class reads it
            debug: if True, only the first chunk of the chunked input files is read
        """
        self.pubchem_dir = pubchem_dir
        self.fsmol_uid_path = fsmol_uid_path
        self.prot_class_path = prot_class_path
        self.mhnfs_path = mhnfs_path
        self.debug = debug

    def load_and_filter_assays(self):
        """
        Load PubChem Assay data from file and filter them:
        1. Drop all assays without protein accession keys
        2. Drop all assays linked to multiple accession keys
        3. Drop all assays with accession keys in FSmol training data

        The filtered table (columns AID, UID) is stored in self.df_assays;
        nothing is returned.
        """

        print('Load assays...')
        df_assays = pd.read_table(
            f'{self.pubchem_dir}/bioassays.tsv.gz',
            usecols=['AID', 'UniProts IDs'],
        ).rename(columns={'UniProts IDs': 'UID'})

        # Load FSmol training data accession keys (flattened, de-duplicated, empty keys ignored)
        with open(self.fsmol_uid_path, 'r') as f:
            key_lists = json.load(f).values()
        fs_train_targets = sorted({key for sublist in key_lists for key in sublist if key})

        print('Filter assays...')
        df_assays = df_assays.dropna(subset=['UID'])
        # '|' separates multiple accession keys; match it literally rather than as a regex
        df_assays = df_assays[~df_assays['UID'].str.contains('|', regex=False)]
        # An empty alternation would match every row and silently drop all assays,
        # so the filter is skipped when there are no training keys.
        if fs_train_targets:
            pattern = '|'.join(re.escape(key) for key in fs_train_targets)
            df_assays = df_assays[~df_assays['UID'].str.contains(pattern)]
        self.df_assays = df_assays

    def load_and_filter_bioactivities(self, chunk_size=10_000_000):
        """
        Load bioactivity data in chunks and filter out datapoints with
        1. assay not in self.df_assays
        2. outcome not 'Active'/'Inactive'
        3. missing CID

        The result (integer columns AID, CID, Activity with Active=1, Inactive=0)
        is stored in self.df_bio.

        Args:
            chunk_size: number of rows read per chunk
        """

        print('Load bioactivities...')
        aids = self.df_assays['AID'].unique()
        filtered_chunks = []
        with pd.read_csv(
            f'{self.pubchem_dir}/bioactivities.tsv.gz',
            sep='\t',
            chunksize=chunk_size,
            usecols=['AID', 'CID', 'Activity Outcome'],
        ) as reader:
            for chunk in reader:
                keep = chunk['AID'].isin(aids) & chunk['Activity Outcome'].isin(['Inactive', 'Active'])
                filtered_chunks.append(chunk[keep])
                if self.debug:
                    break  # For debugging: first chunk only
        df_bio = pd.concat(filtered_chunks)
        df_bio = df_bio[df_bio['CID'].notna()].copy()
        df_bio['Activity'] = (df_bio['Activity Outcome'] == 'Active').astype(int)
        self.df_bio = df_bio.drop(columns='Activity Outcome').astype(int)

    def merge_assay_and_activity_data(self):
        """
        Left-join the assay table onto the bioactivities by AID and store the
        result in self.df (UID as str, every other column as int32).
        The two intermediate tables are deleted afterwards.
        """
        print('Merge...')
        self.df = self.df_bio.merge(self.df_assays, on='AID', how='left')
        convert_dict = {col: 'int32' if col != 'UID' else 'str' for col in self.df.columns}
        self.df = self.df.astype(convert_dict)
        del self.df_assays, self.df_bio

    def drop_hts_assays(self, max_datapoints=100_000):
        """
        Drop every assay that has more than max_datapoints rows in self.df.

        Args:
            max_datapoints: largest number of rows an assay may have to be kept
        """
        print('Drop HTS assays...')
        rows_per_assay = self.df.groupby('AID')['AID'].transform('size')
        self.df = self.df[rows_per_assay <= max_datapoints]

    def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
        """
        Keep only targets (UID) with at least na_min active and ni_min inactive rows.

        Args:
            na_min: minimum number of rows with Activity == 1 per target
            ni_min: minimum number of rows with Activity == 0 per target
        """
        print('Drop targets with not enough datapoints...')
        # Per-row counts of the row's own target; this stays well defined when
        # one of the two classes is absent from the whole table.
        uid = self.df['UID']
        n_active = (self.df['Activity'] == 1).astype(int).groupby(uid).transform('sum')
        n_inactive = (self.df['Activity'] == 0).astype(int).groupby(uid).transform('sum')
        self.df = self.df[(n_active >= na_min) & (n_inactive >= ni_min)]

    def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
        """
        Check if each target-compound pair is associated to an unique activity value,
        i.e. every measure either active or inactive. If not, drop it.

        Pairs measured several times with the same outcome are reduced to their
        first row. The result lists the pairs that occurred once first, followed
        by the reduced pairs sorted by (target_col, compound_col).

        Args:
            target_col: column identifying the target
            compound_col: column identifying the compound
        """

        print('Drop conflicting datapoints...')
        keys = [target_col, compound_col]

        # Split into pairs that occur once and pairs that occur several times
        is_repeated = self.df.duplicated(subset=keys, keep=False)
        if not is_repeated.any():
            return  # nothing can conflict
        df_uniques = self.df[~is_repeated]
        df_duplicates = self.df[is_repeated]

        # Keep repeated pairs only when all their measurements agree
        n_outcomes = df_duplicates.groupby(keys)['Activity'].transform('nunique')
        df_rows = (
            df_duplicates[n_outcomes == 1]
            .drop_duplicates(subset=keys, keep='first')
            .sort_values(keys, kind='stable')
        )
        self.df = df_uniques if df_rows.empty else pd.concat([df_uniques, df_rows])

    def add_smiles(self, chunk_size=10_000_000):
        """
        Add a standardized SMILES column to self.df.

        SMILES are read in chunks from smiles.tsv.gz (no header; columns CID, SMILES),
        restricted to the CIDs present in self.df, standardized and merged on CID.
        Rows whose SMILES is missing or could not be standardized are dropped.

        Args:
            chunk_size: number of rows read per chunk
        """
        print('Retrieve SMILES...')
        cids = self.df.CID.astype(int).unique()
        filtered_chunks = []
        with pd.read_table(
            f'{self.pubchem_dir}/smiles.tsv.gz',
            chunksize=chunk_size,
            names=['CID', 'SMILES'],
        ) as reader:
            for chunk in reader:
                filtered_chunks.append(chunk[chunk['CID'].isin(cids)])
                if self.debug:
                    break  # For debugging: first chunk only
        df_smiles = pd.concat(filtered_chunks)

        def cleanup(smiles):
            # NOTE(review): a new Standardizer is built for every molecule; it could
            # probably be created once outside this function -- confirm it is stateless.
            sm = Standardizer(metal_disconnect=True, canon_taut=True)
            mol = Chem.MolFromSmiles(smiles)
            try:
                standardized_mol, _ = sm.standardize_mol(mol)
                return Chem.MolToSmiles(standardized_mol)
            except Exception:
                # Report the offending SMILES and mark it for removal;
                # KeyboardInterrupt/SystemExit are deliberately not swallowed.
                print(smiles)
                return None

        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
        df_smiles = df_smiles.dropna()

        self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])

    def print_stats(self):
        """Print the number of unique targets, assays and compounds and the number of rows of self.df."""
        n_assays = self.df['AID'].nunique()
        n_targets = self.df['UID'].nunique()
        n_compounds = self.df['CID'].nunique()
        n_activities = self.df.shape[0]
        print(f'{n_targets: >5,} targets | {n_assays: >6,} assays | {n_compounds: >9,} compounds | {n_activities: >10,} activity data points')

    def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
        """
        Write self.df to fname as CSV without the index.

        Args:
            fname: output file; missing parent directories are created
        """
        print(f'Save to {fname}...')
        os.makedirs(os.path.dirname(fname) or '.', exist_ok=True)
        self.df.to_csv(fname, index=False)

    def load(self, fname):
        """
        Read a previously saved table from the CSV file fname into self.df.
        """
        print(f'Load from {fname}...')
        self.df = pd.read_csv(fname)

    def add_protein_classifications(self):
        """
        Retrieve protein classification

        Reads the CSV at self.prot_class_path and left-joins its columns
        Organism, L1 and L2 onto self.df by UID.
        """
        print('Retrieve protein classifications...')
        protein_class = pd.read_csv(self.prot_class_path)
        print(protein_class)
        self.df = self.df.merge(protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left')

if __name__ == '__main__':
    # Build the filter from the module-level paths, with debug mode switched off
    pubchem_filter = PubChemFilter(
        PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, False
    )

    # Load the raw tables, join them and report the starting statistics
    pubchem_filter.load_and_filter_assays()
    pubchem_filter.load_and_filter_bioactivities()
    pubchem_filter.merge_assay_and_activity_data()
    pubchem_filter.print_stats()

    # Reduction steps in execution order; statistics are printed after each one
    steps = (
        (pubchem_filter.drop_hts_assays, {}),
        (pubchem_filter.drop_targets_with_limited_data, {}),
        (pubchem_filter.drop_conflicting_bioactivity_measures, {}),
        (pubchem_filter.drop_targets_with_limited_data, {}),
        (pubchem_filter.add_smiles, {}),
        (pubchem_filter.drop_conflicting_bioactivity_measures, {'compound_col': 'SMILES'}),
        (pubchem_filter.drop_targets_with_limited_data, {}),
    )
    for step, kwargs in steps:
        step(**kwargs)
        pubchem_filter.print_stats()

    # Annotate the targets and write the final table
    pubchem_filter.add_protein_classifications()
    pubchem_filter.save(fname='data/pubchem24/preprocessed.csv.gz')