DeepSEQreen_fast_build

Running on CPU Upgrade

App Files Community

libokj committed on Apr 18, 2024

Commit

23e9baa

verified ·

1 Parent(s): 391ee30

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -24

app.py CHANGED Viewed

@@ -23,7 +23,7 @@ from email_validator import validate_email, EmailNotValidError
 import gradio as gr
 import hydra
 import pandas as pd
-from pandarallel import pandarallel
 import requests
 from requests.adapters import HTTPAdapter, Retry
 from markdown import markdown
@@ -42,7 +42,7 @@ import panel as pn
 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query
-# import swifter
 from tqdm.auto import tqdm
 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -741,7 +741,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
             orig_df['Target Family'].isna(), 'Target Family'
         ] = orig_df.loc[
             orig_df['Target Family'].isna(), 'X2'
-        ].parallel_apply(detect_family)
     detect_family.cache_clear()
@@ -794,15 +794,15 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
                 subset.to_csv(predict_subset_filepath, index=False, na_rep='')
                 seen_compounds = get_seen_smiles(family, task_value)
-                if subset['X1'].iloc[0] in seen_compounds['X1'].values:
                     scenario = "Seen Compound"
                 else:
                     scenario = "Unseen Compound"
-                filtered_df = benchmark_df[(benchmark_df['Family'] == target_family.title())
                                            & (benchmark_df['Scenario'] == scenario)]
-                preset = filtered_df.loc[filtered_df[score].idxmax(), 'preset']
                 preset_value = PRESET_MAP[preset]
                 target_family = TARGET_FAMILY_MAP[family.title()]
@@ -810,7 +810,8 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
                     config_name="webserver_inference",
                     overrides=[f"task={task_value}",
                                f"preset={preset_value}",
-                               f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
                                f"data.data_file='{str(predict_subset_filepath)}'"])
                 predictions, _ = predict(cfg)
@@ -822,14 +823,14 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
         prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
         # prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
-        #     lambda group: group.parallel_apply(
         #         max_tanimoto_similarity,
         #         seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
         #     )
         # ).values
         #
         # prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
-        #     lambda group: group.parallel_apply(
         #         max_sequence_identity,
         #         seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
         #     )
@@ -838,7 +839,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
             for family in prediction_df['Target Family'].unique():
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = prediction_df.loc[
-                    prediction_df['Target Family'] == family, 'X1'].parallel_apply(
                     max_tanimoto_similarity,
                     seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
                 )
@@ -847,7 +848,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
             for family in prediction_df['Target Family'].unique():
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = prediction_df.loc[
-                    prediction_df['Target Family'] == family, 'X2'].parallel_apply(
                     max_sequence_identity,
                     seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
                 )
@@ -902,10 +903,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
         if 'X1' in df.columns:
             if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
-                df['Compound'] = df['X1'].parallel_apply(
                     lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
-            df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
-            df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
         if task == 'Compound-Protein Binding Affinity':
             # Convert Y^ from pIC50 to IC50
@@ -986,13 +987,13 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         elif 'Y^' in df_html.columns:
             job = 'Interaction Pair Inference'
     if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
-        df_html['Compound'] = df_html['Compound'].parallel_apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Compound'], axis=1, inplace=True)
     if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
-        df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1000,7 +1001,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
     df_html.rename(columns=column_aliases, inplace=True)
     df_html.index.name = 'Index'
     if 'Target FASTA' in df_html.columns:
-        df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
             lambda x: wrap_text(x) if not pd.isna(x) else x)
     num_cols = df_html.select_dtypes('number').columns
@@ -1018,7 +1019,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
         if 'Target ID' in df_html.columns:
             df_html.drop(['Target FASTA'], axis=1, inplace=True)
         if 'Target FASTA' in df_html.columns:
-            df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
                 lambda x: wrap_text(x) if not pd.isna(x) else x)
         if 'Scaffold SMILES' in df_html.columns:
             df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
@@ -1272,11 +1273,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
     df_report = df.copy()
     try:
         for filter_name in filter_list:
-            df_report[filter_name] = df_report['Compound'].parallel_apply(
                 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
         for score_name in score_list:
-            df_report[score_name] = df_report['Compound'].parallel_apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         # pie_chart = None
@@ -1918,7 +1919,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
                 alignment = aligner.align(processed_fasta, query)
                 return alignment.score / max(len(processed_fasta), len(query))
-            alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
             row = alignment_df.loc[alignment_df['score'].idxmax()]
             family = str(row['Target Family']).title()
             return gr.Dropdown(value=family,
@@ -2239,13 +2240,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
             infer_df = pd.read_csv(drug_target_pair_upload)
             validate_columns(infer_df, ['X1', 'X2'])
-            infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
                 validate_seq_str, regex=SMILES_PAT)
             if not infer_df['X1_ERR'].isna().all():
                 raise ValueError(
                     f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
-            infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
                 validate_seq_str, regex=FASTA_PAT)
             if not infer_df['X2_ERR'].isna().all():
                 raise ValueError(
@@ -2564,7 +2565,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
 if __name__ == "__main__":
-    pandarallel.initialize()
     hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
     demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
     scheduler.add_job(check_expiry, 'interval', hours=1)

 import gradio as gr
 import hydra
 import pandas as pd
+# from pandarallel import pandarallel
 import requests
 from requests.adapters import HTTPAdapter, Retry
 from markdown import markdown
 from apscheduler.schedulers.background import BackgroundScheduler
 from tinydb import TinyDB, Query
+import swifter
 from tqdm.auto import tqdm
 from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
             orig_df['Target Family'].isna(), 'Target Family'
         ] = orig_df.loc[
             orig_df['Target Family'].isna(), 'X2'
+        ].swifter.apply(detect_family)
     detect_family.cache_clear()
                 subset.to_csv(predict_subset_filepath, index=False, na_rep='')
                 seen_compounds = get_seen_smiles(family, task_value)
+                if subset['X1'].iloc[0] in seen_compounds:
                     scenario = "Seen Compound"
                 else:
                     scenario = "Unseen Compound"
+                filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
                                            & (benchmark_df['Scenario'] == scenario)]
+                preset = filtered_df.loc[filtered_df[score].idxmax(), 'Model']
                 preset_value = PRESET_MAP[preset]
                 target_family = TARGET_FAMILY_MAP[family.title()]
                     config_name="webserver_inference",
                     overrides=[f"task={task_value}",
                                f"preset={preset_value}",
+                               f"ckpt_path=D:/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
+                                # f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
                                f"data.data_file='{str(predict_subset_filepath)}'"])
                 predictions, _ = predict(cfg)
         prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
         # prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
+        #     lambda group: group.swifter.apply(
         #         max_tanimoto_similarity,
         #         seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
         #     )
         # ).values
         #
         # prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
+        #     lambda group: group.swifter.apply(
         #         max_sequence_identity,
         #         seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
         #     )
             for family in prediction_df['Target Family'].unique():
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = prediction_df.loc[
+                    prediction_df['Target Family'] == family, 'X1'].swifter.apply(
                     max_tanimoto_similarity,
                     seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
                 )
             for family in prediction_df['Target Family'].unique():
                 prediction_df.loc[
                     prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = prediction_df.loc[
+                    prediction_df['Target Family'] == family, 'X2'].swifter.apply(
                     max_sequence_identity,
                     seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
                 )
         if 'X1' in df.columns:
             if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
+                df['Compound'] = df['X1'].swifter.apply(
                     lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
+            df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
+            df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
         if task == 'Compound-Protein Binding Affinity':
             # Convert Y^ from pIC50 to IC50
         elif 'Y^' in df_html.columns:
             job = 'Interaction Pair Inference'
     if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
+        df_html['Compound'] = df_html['Compound'].swifter.apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Compound'], axis=1, inplace=True)
     if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
+        df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
             lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
     else:
         df_html.drop(['Scaffold'], axis=1, inplace=True)
     df_html.rename(columns=column_aliases, inplace=True)
     df_html.index.name = 'Index'
     if 'Target FASTA' in df_html.columns:
+        df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
             lambda x: wrap_text(x) if not pd.isna(x) else x)
     num_cols = df_html.select_dtypes('number').columns
         if 'Target ID' in df_html.columns:
             df_html.drop(['Target FASTA'], axis=1, inplace=True)
         if 'Target FASTA' in df_html.columns:
+            df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
                 lambda x: wrap_text(x) if not pd.isna(x) else x)
         if 'Scaffold SMILES' in df_html.columns:
             df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
     df_report = df.copy()
     try:
         for filter_name in filter_list:
+            df_report[filter_name] = df_report['Compound'].swifter.apply(
                 lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
         for score_name in score_list:
+            df_report[score_name] = df_report['Compound'].swifter.apply(
                 lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
         # pie_chart = None
                 alignment = aligner.align(processed_fasta, query)
                 return alignment.score / max(len(processed_fasta), len(query))
+            alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
             row = alignment_df.loc[alignment_df['score'].idxmax()]
             family = str(row['Target Family']).title()
             return gr.Dropdown(value=family,
             infer_df = pd.read_csv(drug_target_pair_upload)
             validate_columns(infer_df, ['X1', 'X2'])
+            infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
                 validate_seq_str, regex=SMILES_PAT)
             if not infer_df['X1_ERR'].isna().all():
                 raise ValueError(
                     f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+            infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
                 validate_seq_str, regex=FASTA_PAT)
             if not infer_df['X2_ERR'].isna().all():
                 raise ValueError(
 if __name__ == "__main__":
+    # pandarallel.initialize()
     hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
     demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
     scheduler.add_job(check_expiry, 'interval', hours=1)