DeepSEQreen_fast_build

Running on CPU Upgrade

App Files Files Community

libokj committed on Apr 22, 2024

Commit

0dd77ac

verified ·

1 Parent(s): 264757c

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -23

app.py CHANGED Viewed

@@ -43,7 +43,7 @@ import panel as pn
 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query
-import swifter
 from tqdm.auto import tqdm
 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -786,7 +786,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         orig_df['Target Family'] = None
     if orig_df['Target Family'].isna().any():
         orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
-            orig_df.loc[orig_df['Target Family'].isna(), 'X2'].swifter.apply(detect_family)
         )
     detect_family.cache_clear()
@@ -885,7 +885,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
         if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_smiles_df = get_seen_smiles(family=family, task=task_value)
-                family_smiles_df['FP'] = family_smiles_df['X1'].swifter.apply(smiles_to_ecfp)
                 @cache
                 def max_sim(smi):
@@ -893,7 +893,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
-                    prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].swifter.apply(max_sim)
                 )
                 max_sim.cache_clear()
@@ -907,13 +907,13 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
                 return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
             prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
-                prediction_df['X1'].swifter.apply(max_sim).apply(pd.Series)
             )
             max_sim.cache_clear()
         if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
             x2 = prediction_df['X2'].iloc[0]
-            prediction_df['X1^'] = prediction_df['X1'].swifter.apply(rdkit_canonicalize)
             @cache
             def calculate_max_sequence_identity(compound):
@@ -921,7 +921,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
                 return max_sequence_identity(x2, seen_fastas=compound_targets)
             prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
-                prediction_df['X1^'].swifter.apply(calculate_max_sequence_identity).apply(pd.Series)
             )
             prediction_df.drop(['X1^'], axis=1, inplace=True)
@@ -937,7 +937,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
-                    prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].swifter.apply(max_id)
                 )
                 max_id.cache_clear()
@@ -991,10 +991,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
         if 'X1' in df.columns:
             if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
-                df['Compound'] = df['X1'].swifter.apply(
                     lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
-            df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
-            df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
         if task == 'Compound-Protein Binding Affinity':
             # Convert Y^ from pIC50 to IC50
@@ -1040,13 +1040,13 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
     columns_unique = None
     if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
-        df_html['Compound'] = df_html['Compound'].swifter.apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Compound'], axis=1, inplace=True)
     if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
-        df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1076,7 +1076,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
     df_html.rename(columns=column_aliases, inplace=True)
     df_html.index.name = 'Index'
     if 'Target FASTA' in df_html.columns:
-        df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
             lambda x: wrap_text(x) if not pd.isna(x) else x)
     num_cols = df_html.select_dtypes('number').columns
@@ -1094,7 +1094,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         if 'Target ID' in df_html.columns:
             df_html.drop(['Target FASTA'], axis=1, inplace=True)
         if 'Target FASTA' in df_html.columns:
-            df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
                 lambda x: wrap_text(x) if not pd.isna(x) else x)
         if 'Scaffold SMILES' in df_html.columns:
             df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
@@ -1159,7 +1159,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         report_table = pn.widgets.Tabulator(
             df_html, formatters=formatters,
-            frozen_columns=['Index', 'Target ID', 'Compound ID', 'Compound', 'Scaffold'],
             disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
         for i, col in enumerate(num_cols):
@@ -1314,7 +1316,7 @@ def create_pie_chart(df, category, value, top_k):
         ("Percentage", "@proportion{0.0%}")
     ]
-    if category == 'Scaffold SMILES':
         data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
                           left_on='Scaffold SMILES', right_on='Scaffold SMILES')
         tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
@@ -1353,11 +1355,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
     df_report = df.copy()
     try:
         for filter_name in filter_list:
-            df_report[filter_name] = df_report['Compound'].swifter.apply(
                 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
         for score_name in score_list:
-            df_report[score_name] = df_report['Compound'].swifter.apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         return (create_html_report(df_report, file=None, task=task), df_report,
@@ -1990,7 +1992,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 alignment = aligner.align(processed_fasta, query)
                 return alignment.score / max(len(processed_fasta), len(query))
-            alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
             row = alignment_df.loc[alignment_df['score'].idxmax()]
             family = str(row['Target Family']).title()
             return gr.Dropdown(value=family,
@@ -2316,13 +2318,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
             infer_df = pd.read_csv(drug_target_pair_upload)
             validate_columns(infer_df, ['X1', 'X2'])
-            infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
                 validate_seq_str, regex=SMILES_PAT)
             if not infer_df['X1_ERR'].isna().all():
                 raise ValueError(
                     f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-            infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
                 validate_seq_str, regex=FASTA_PAT)
             if not infer_df['X2_ERR'].isna().all():
                 raise ValueError(
@@ -2546,6 +2548,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
                                              info=f'Found {label} in your uploaded dataset. '
                                                   'Is it compound-protein interaction or binding affinity?'),
                     html_report: ''}
     report_df_change = file_for_report.change(
@@ -2562,7 +2566,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
         concurrency_limit=100,
     ).success(
         fn=inquire_task, inputs=[raw_df],
-        outputs=[report_task, html_report, analyze_btn, csv_generate, html_generate],
     )
     file_for_report.clear(

 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query
+# import swifter
 from tqdm.auto import tqdm
 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
         orig_df['Target Family'] = None
     if orig_df['Target Family'].isna().any():
         orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
+            orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
         )
     detect_family.cache_clear()
         if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
             for family in prediction_df['Target Family'].unique():
                 family_smiles_df = get_seen_smiles(family=family, task=task_value)
+                family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
                 @cache
                 def max_sim(smi):
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
+                    prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
                 )
                 max_sim.cache_clear()
                 return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
             prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
+                prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
             )
             max_sim.cache_clear()
         if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
             x2 = prediction_df['X2'].iloc[0]
+            prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
             @cache
             def calculate_max_sequence_identity(compound):
                 return max_sequence_identity(x2, seen_fastas=compound_targets)
             prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
+                prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
             )
             prediction_df.drop(['X1^'], axis=1, inplace=True)
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
+                    prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
                 )
                 max_id.cache_clear()
         if 'X1' in df.columns:
             if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
+                df['Compound'] = df['X1'].parallel_apply(
                     lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
+            df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
+            df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
         if task == 'Compound-Protein Binding Affinity':
             # Convert Y^ from pIC50 to IC50
     columns_unique = None
     if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
+        df_html['Compound'] = df_html['Compound'].parallel_apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Compound'], axis=1, inplace=True)
     if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
+        df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Scaffold'], axis=1, inplace=True)
     df_html.rename(columns=column_aliases, inplace=True)
     df_html.index.name = 'Index'
     if 'Target FASTA' in df_html.columns:
+        df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
             lambda x: wrap_text(x) if not pd.isna(x) else x)
     num_cols = df_html.select_dtypes('number').columns
         if 'Target ID' in df_html.columns:
             df_html.drop(['Target FASTA'], axis=1, inplace=True)
         if 'Target FASTA' in df_html.columns:
+            df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
                 lambda x: wrap_text(x) if not pd.isna(x) else x)
         if 'Scaffold SMILES' in df_html.columns:
             df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
         report_table = pn.widgets.Tabulator(
             df_html, formatters=formatters,
+            frozen_columns=[col for col in df_html.columns if col in [
+                'Index', 'Target ID', 'Compound ID', 'Compound', 'Scaffold'
+            ]],
             disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
         for i, col in enumerate(num_cols):
         ("Percentage", "@proportion{0.0%}")
     ]
+    if category == 'Scaffold SMILES' and 'Scaffold' in df.columns:
         data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
                           left_on='Scaffold SMILES', right_on='Scaffold SMILES')
         tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
     df_report = df.copy()
     try:
         for filter_name in filter_list:
+            df_report[filter_name] = df_report['Compound'].parallel_apply(
                 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
         for score_name in score_list:
+            df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         return (create_html_report(df_report, file=None, task=task), df_report,
                 alignment = aligner.align(processed_fasta, query)
                 return alignment.score / max(len(processed_fasta), len(query))
+            alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
             row = alignment_df.loc[alignment_df['score'].idxmax()]
             family = str(row['Target Family']).title()
             return gr.Dropdown(value=family,
             infer_df = pd.read_csv(drug_target_pair_upload)
             validate_columns(infer_df, ['X1', 'X2'])
+            infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
                 validate_seq_str, regex=SMILES_PAT)
             if not infer_df['X1_ERR'].isna().all():
                 raise ValueError(
                     f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+            infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
                 validate_seq_str, regex=FASTA_PAT)
             if not infer_df['X2_ERR'].isna().all():
                 raise ValueError(
                                              info=f'Found {label} in your uploaded dataset. '
                                                   'Is it compound-protein interaction or binding affinity?'),
                     html_report: ''}
+        else:
+            return {report_task: gr.Dropdown(visible=False)}
     report_df_change = file_for_report.change(
         concurrency_limit=100,
     ).success(
         fn=inquire_task, inputs=[raw_df],
+        outputs=[report_task, html_report],
     )
     file_for_report.clear(