DeepSEQreen_fast_build

Running on CPU Upgrade

App Files Files Community

libokj committed on Apr 21, 2024

Commit

db33be0

verified ·

1 Parent(s): 15b93be

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -36

app.py CHANGED Viewed

@@ -278,36 +278,70 @@ def check_expiry():
 def max_tanimoto_similarity(smi, seen_smiles_with_fp):
-    if smi is None:
-        return 0
     if smi in seen_smiles_with_fp['X1'].values:
-        return 1
     mol = Chem.MolFromSmiles(smi)
     if mol is None:
-        return 0
     mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
     sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
-    return max(sims)
 def max_sequence_identity(seq, seen_fastas):
-    if seq is None:
-        return 0
-    if seq in seen_fastas:
-        return 1
     aligner = PairwiseAligner()
     aligner.mode = 'local'
-    max_id = 0
-    for fasta in seen_fastas:
         alignment = aligner.align(seq, fasta)
         identity = alignment.score / max(len(seq), len(fasta))
         if identity == 1:
-            return 1
-        max_id = max(identity, max_id)
-    return max_id
-@cache
 def get_seen_smiles(family, task):
     if family == 'General':
         family = 'all_families_full'
@@ -318,7 +352,6 @@ def get_seen_smiles(family, task):
     return seen_smiles
-@cache
 def get_seen_fastas(family, task):
     if family == 'General':
         family = 'all_families_full'
@@ -717,7 +750,8 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
             return row['Target Family']
         # If no exact match, then calculate alignment score
         else:
-            aligner = PairwiseAligner(mode='local')
             def align_score(target):
                 alignment = aligner.align(query, target)
@@ -827,24 +861,62 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
         prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
-        if "Include Max. Tanimoto Similarity" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_smiles_df = get_seen_smiles(family=family, task=task_value)
                 family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
                     lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
                         Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
                 )
-                max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=family_smiles_df))
-                prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = (
                     prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
                 )
                 max_sim.cache_clear()
-        if "Include Max. Sequence Identity" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_fastas_df = get_seen_fastas(family=family, task=task_value)
-                max_id = cache(partial(max_sequence_identity, seen_fastas=family_fastas_df['X2'].values))
-                prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = (
                     prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
                 )
                 max_id.cache_clear()
@@ -1499,10 +1571,18 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                     drug_library_upload = gr.File(label='Custom compound library file', visible=False)
                 drug_screen_opts = gr.CheckboxGroup(
-                    ['Include Max. Tanimoto Similarity'],
-                    label='Step 6. Select Additional Options',
-                    info="Calculating the maximum Tanimoto similarity of the library compounds to the "
-                         "training dataset is an experimental feature and may take a considerable amount of time."
                 )
             with gr.Row():
                 with gr.Column():
@@ -1516,7 +1596,6 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 with gr.Row():
                     drug_screen_clr_btn = gr.ClearButton(size='lg')
                     drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
-        # TODO Modify the pd df directly with df['X2'] = target
         screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
@@ -1606,10 +1685,11 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                         target_library_upload = gr.File(label='Custom target library file', visible=False)
                     target_identify_opts = gr.CheckboxGroup(
-                        ['Include Max. Sequence Identity'],
                         label='Step 6. Select Additional Options',
-                        info="Calculating the maximum sequence identity of the library protein to the "
-                             "training dataset is an experimental feature and may take a considerable amount of time."
                     )
                 with gr.Row():
                     with gr.Column():
@@ -1934,6 +2014,14 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
         x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
     ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
     def example_fill(input_type):
         return {target_id: 'Q16539',
@@ -2558,14 +2646,14 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
 if __name__ == "__main__":
     pandarallel.initialize()
     hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
     session = requests.Session()
     ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
     session.mount('http://', ADAPTER)
     session.mount('https://', ADAPTER)
     db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
     # Set all RUNNING jobs to FAILED at TinyDB initialization
     Job = Query()
@@ -2573,9 +2661,9 @@ if __name__ == "__main__":
     for job in jobs:
         if job['status'] == 'RUNNING':
             db.update({'status': 'FAILED'}, Job.id == job['id'])
     scheduler = BackgroundScheduler()
     scheduler.add_job(check_expiry, 'interval', hours=1)
     scheduler.start()
     demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)

 def max_tanimoto_similarity(smi, seen_smiles_with_fp):
     """Find the reference compound most similar to ``smi``.

     Args:
         smi: Query SMILES string, or ``None``.
         seen_smiles_with_fp: DataFrame of reference compounds with an ``X1``
             column (SMILES) and an ``FP`` column (one fingerprint per row,
             in a form ``BulkTanimotoSimilarity`` accepts); an optional
             ``ID1`` column supplies display names. May be ``None`` or empty.

     Returns:
         A dict with two keys: ``'Max. Tanimoto Similarity'`` (0 when there
         is nothing to compare or the SMILES cannot be parsed, 1 on an exact
         string match, otherwise the best score) and
         ``'Max. Tanimoto Similarity Compound'`` (the matching row's ``ID1``
         when present and non-empty, else its ``X1``; ``None`` when the
         score is the 0 fallback).
     """
     def _result(score, compound):
         return {'Max. Tanimoto Similarity': score, 'Max. Tanimoto Similarity Compound': compound}

     def _label(row_id, fallback):
         # Prefer a usable ID over the raw SMILES when naming the hit.
         return row_id if pd.notnull(row_id) and row_id != '' else fallback

     if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
         return _result(0, None)
     has_ids = 'ID1' in seen_smiles_with_fp.columns
     # Exact string match short-circuits fingerprinting entirely.
     if smi in seen_smiles_with_fp['X1'].values:
         compound = smi
         if has_ids:
             first_id = seen_smiles_with_fp.loc[seen_smiles_with_fp['X1'] == smi, 'ID1'].values[0]
             compound = _label(first_id, smi)
         return _result(1, compound)
     mol = Chem.MolFromSmiles(smi)
     if mol is None:
         return _result(0, None)
     mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
     # Hand RDKit a plain list so scores are positional and line up with the
     # .iloc lookup below even when the frame's index is not 0..n-1.
     # NOTE(review): a Series with a filtered index appears to be indexed by
     # label inside RDKit -- confirm against the installed version.
     sims = BulkTanimotoSimilarity(mol_ecfp, list(seen_smiles_with_fp['FP']))
     # BulkTanimotoSimilarity returns a Python list, which has no .argmax();
     # pick the first position holding the maximum score instead.
     idx = max(range(len(sims)), key=sims.__getitem__)
     best_row = seen_smiles_with_fp.iloc[idx]
     compound = best_row['X1']
     if has_ids:
         compound = _label(best_row['ID1'], compound)
     return _result(sims[idx], compound)
 def max_sequence_identity(seq, seen_fastas):
     """Find the reference sequence with the highest identity to ``seq``.

     Identity is the local-alignment score divided by the longer of the two
     sequence lengths.

     Args:
         seq: Query sequence string, or ``None``.
         seen_fastas: DataFrame of reference sequences with an ``X2`` column;
             an optional ``ID2`` column supplies display names. May be
             ``None`` or empty.

     Returns:
         A dict with two keys: ``'Max. Sequence Identity'`` (0 when there is
         nothing to compare, 1 on an exact match, otherwise the best
         identity found) and ``'Max. Sequence Identity Target'`` (the
         ``ID2`` of the first row holding the best sequence when present and
         non-empty, else the sequence itself; ``None`` when no reference
         scored above 0).
     """
     def _result(identity, target):
         return {'Max. Sequence Identity': identity, 'Max. Sequence Identity Target': target}

     if seq is None or seen_fastas is None or seen_fastas.empty:
         return _result(0, None)
     has_ids = 'ID2' in seen_fastas.columns

     def _name_of(fasta):
         # Resolve a sequence to its display name: the ID2 of the first row
         # carrying that sequence, falling back to the sequence itself.
         if has_ids:
             first_id = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
             if pd.notnull(first_id) and first_id != '':
                 return first_id
         return fasta

     library = seen_fastas['X2'].values
     # Exact string match short-circuits alignment entirely.
     if seq in library:
         return _result(1, _name_of(seq))
     aligner = PairwiseAligner()
     aligner.mode = 'local'
     max_iden = 0
     best_fasta = None
     for fasta in library:
         # Null entries would crash len(); empty ones could only score 0.
         if pd.isnull(fasta) or not fasta:
             continue
         identity = aligner.align(seq, fasta).score / max(len(seq), len(fasta))
         if identity == 1:
             return _result(1, _name_of(fasta))
         if identity > max_iden:
             max_iden = identity
             best_fasta = fasta
     # Name lookup is a full-frame scan, so do it once for the winner rather
     # than on every improvement inside the loop.
     target = None if best_fasta is None else _name_of(best_fasta)
     return _result(max_iden, target)
 def get_seen_smiles(family, task):
     if family == 'General':
         family = 'all_families_full'
     return seen_smiles
 def get_seen_fastas(family, task):
     if family == 'General':
         family = 'all_families_full'
             return row['Target Family']
         # If no exact match, then calculate alignment score
         else:
+            aligner = PairwiseAligner()
+            aligner.mode = 'local'
             def align_score(target):
                 alignment = aligner.align(query, target)
         prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
         prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
+        if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_smiles_df = get_seen_smiles(family=family, task=task_value)
                 family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
                     lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
                         Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
                 )
+                @cache
+                def max_sim(smi):
+                    return max_tanimoto_similarity(smi, family_smiles_df)['Max. Tanimoto Similarity']
+                prediction_df.loc[
+                    prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
                     prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
                 )
                 max_sim.cache_clear()
+        if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
+            x2 = prediction_df['X2'].iloc[0]
+            pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)]
+            pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(
+                lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
+                    Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
+            )
+            max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=pos_compounds_df))
+            prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
+                prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
+            )
+            max_sim.cache_clear()
+        if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
+            x2 = prediction_df['X2'].iloc[0]
+            prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
+            @cache
+            def calculate_max_sequence_identity(compound):
+                compound_targets = df_training.loc[df_training['X1'] == compound]
+                return max_sequence_identity(x2, seen_fastas=compound_targets)
+            prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
+                prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
+            )
+            calculate_max_sequence_identity.cache_clear()
+        if "Include Target Max. Sequence Identity to Training Targets" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_fastas_df = get_seen_fastas(family=family, task=task_value)
+                @cache
+                def max_id(seq):
+                    return max_sequence_identity(seq, seen_fastas=family_fastas_df)['Max. Sequence Identity']
+                prediction_df.loc[
+                    prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
                     prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
                 )
                 max_id.cache_clear()
                     drug_library_upload = gr.File(label='Custom compound library file', visible=False)
                 drug_screen_opts = gr.CheckboxGroup(
+                    label="Step 6. Select Additional Options",
+                    choices=[
+                        'Include Compound Max. Tanimoto Similarity to Training Compounds',
+                        'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
+                        'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
+                    ],
+                    info="These are experimental features and may increase the job computation time. "
+                         "Compound Max. Tanimoto Similarity to Training Compounds and "
+                         "Target Max. Sequence Identity to Known Interacting Targets of Compound "
+                         "are indicative of the predictive reliability of the model (the higher the more reliable), "
+                         "while Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
+                         "is indicative of the novelty of the compound (the lower the more novel)."
                 )
             with gr.Row():
                 with gr.Column():
                 with gr.Row():
                     drug_screen_clr_btn = gr.ClearButton(size='lg')
                     drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
         screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
                         target_library_upload = gr.File(label='Custom target library file', visible=False)
                     target_identify_opts = gr.CheckboxGroup(
+                        ['Include Target Max. Sequence Identity to Training Targets'],
                         label='Step 6. Select Additional Options',
+                        info="These are experimental features and may increase the job computation time. "
+                             "Target Max. Sequence Identity to Training Targets is indicative of the "
+                             "predictive reliability of the model (the higher the more reliable)."
                     )
                 with gr.Row():
                     with gr.Column():
         x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
     ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
+    drug_screen_task.select(
+        fn=lambda task, opts: [opt for opt in opts if opt not in [
+            'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
+            'Include Target Max. Sequence Identity to Known Interacting Targets of Compound'
+        ]] if task == 'Compound-Protein Binding Affinity' else opts,
+        inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
+        show_progress='hidden'
+    )
     def example_fill(input_type):
         return {target_id: 'Q16539',
 if __name__ == "__main__":
     pandarallel.initialize()
     hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
     session = requests.Session()
     ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
     session.mount('http://', ADAPTER)
     session.mount('https://', ADAPTER)
     db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
     # Set all RUNNING jobs to FAILED at TinyDB initialization
     Job = Query()
     for job in jobs:
         if job['status'] == 'RUNNING':
             db.update({'status': 'FAILED'}, Job.id == job['id'])
     scheduler = BackgroundScheduler()
     scheduler.add_job(check_expiry, 'interval', hours=1)
     scheduler.start()
     demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)