libokj committed on
Commit
f27ac27
·
verified ·
1 Parent(s): 7123c5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -23
app.py CHANGED
@@ -45,7 +45,7 @@ import panel as pn
45
  from apscheduler.schedulers.background import BackgroundScheduler
46
  from tinydb import TinyDB, Query
47
 
48
- import swifter
49
  from tqdm.auto import tqdm
50
 
51
  from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -837,7 +837,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
837
  if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
838
  x2 = prediction_df['X2'].iloc[0]
839
  pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
840
- pos_compounds_df['FP'] = pos_compounds_df['X1'].swifter.apply(smiles_to_ecfp)
841
 
842
  @cache
843
  def max_sim(smiles):
@@ -846,13 +846,13 @@ def apply_advanced_opts(prediction_df, opts, df_training):
846
  prediction_df[[
847
  'Max. Tanimoto Similarity to Known Ligands',
848
  'Max. Sim. Ligand'
849
- ]] = prediction_df['X1'].swifter.apply(max_sim).apply(pd.Series)
850
 
851
  max_sim.cache_clear()
852
 
853
  if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
854
  x2 = prediction_df['X2'].iloc[0]
855
- prediction_df['X1^'] = prediction_df['X1'].swifter.apply(rdkit_canonicalize)
856
 
857
  @cache
858
  def max_id(compound):
@@ -861,7 +861,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
861
 
862
  prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
863
  'Max. Id. Target']] = (
864
- prediction_df['X1^'].swifter.apply(max_id).apply(pd.Series)
865
  )
866
  prediction_df.drop(['X1^'], axis=1, inplace=True)
867
 
@@ -870,7 +870,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
870
  # Advanced options for Target Protein Identification
871
  if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
872
  x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
873
- df_training['FP'] = df_training['X1'].swifter.apply(smiles_to_ecfp)
874
 
875
  prediction_df[[
876
  'Max. Tanimoto Similarity to Training Compounds',
@@ -888,7 +888,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
888
  prediction_df[[
889
  'Max. Sequence Identity to Known Targets of Input Compound',
890
  'Max. Id. Target'
891
- ]] = prediction_df['X2'].swifter.apply(max_id).apply(pd.Series)
892
 
893
  max_id.cache_clear()
894
 
@@ -904,7 +904,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
904
  prediction_df[[
905
  'Max. Tanimoto Similarity to Known Ligands of Identified Target',
906
  'Max. Sim. Ligand'
907
- ]] = prediction_df['X2'].swifter.apply(max_sim).apply(pd.Series)
908
 
909
  max_sim.cache_clear()
910
 
@@ -949,12 +949,12 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
949
  orig_df['Target Family'] = None
950
  if orig_df['Target Family'].isna().any():
951
  orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
952
- orig_df.loc[orig_df['Target Family'].isna(), 'X2'].swifter.apply(detect_family)
953
  )
954
  orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
955
  detect_family.cache_clear()
956
 
957
- orig_df['X1^'] = orig_df['X1'].swifter.apply(rdkit_canonicalize)
958
 
959
  orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
960
  annotated_df = orig_df[~orig_df['Y'].isna()].copy()
@@ -1109,10 +1109,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
1109
 
1110
  if 'X1' in df.columns:
1111
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
1112
- df['Compound'] = df['X1'].swifter.apply(
1113
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
1114
- df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
1115
- df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
1116
  df['Pharmacophore'] = None
1117
  if task == 'Compound-Protein Binding Affinity':
1118
  # Convert Y^ from pIC50 (nM) to IC50 (nM)
@@ -1182,17 +1182,17 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1182
  columns_unique = None
1183
 
1184
  if 'Exclude Pharmacophore 3D' not in opts:
1185
- df_html['Pharmacophore'] = df_html['Compound'].swifter.apply(
1186
  lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
1187
 
1188
  if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
1189
- df_html['Compound'] = df_html['Compound'].swifter.apply(
1190
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1191
  else:
1192
  df_html.drop(['Compound'], axis=1, inplace=True)
1193
 
1194
  if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
1195
- df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
1196
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1197
  else:
1198
  df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1227,7 +1227,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1227
  df_html.rename(columns=column_aliases, inplace=True)
1228
  df_html.index.name = 'Index'
1229
  if 'Target FASTA' in df_html.columns:
1230
- df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
1231
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1232
 
1233
  num_cols = df_html.select_dtypes('number').columns
@@ -1247,7 +1247,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1247
  if 'Target ID' in df_html.columns:
1248
  df_html.drop(['Target FASTA'], axis=1, inplace=True)
1249
  if 'Target FASTA' in df_html.columns:
1250
- df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
1251
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1252
  if 'Scaffold SMILES' in df_html.columns:
1253
  df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
@@ -1555,11 +1555,11 @@ def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progr
1555
  df_report = df.copy()
1556
  try:
1557
  for filter_name in filter_list:
1558
- df_report[filter_name] = df_report['Compound'].swifter.apply(
1559
  lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
1560
 
1561
  for score_name in score_list:
1562
- df_report[score_name] = df_report['Compound'].swifter.apply(
1563
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1564
 
1565
  if opt_list:
@@ -2263,7 +2263,7 @@ higher similarities usually correspond to greater prediction confidence.<br>
2263
  alignment = aligner.align(processed_fasta, query)
2264
  return alignment.score / max(len(processed_fasta), len(query))
2265
 
2266
- alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
2267
  row = alignment_df.loc[alignment_df['score'].idxmax()]
2268
  family = str(row['Target Family']).title()
2269
  return gr.Dropdown(value=family,
@@ -2595,13 +2595,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2595
  infer_df = pd.read_csv(drug_target_pair_upload)
2596
  validate_columns(infer_df, ['X1', 'X2'])
2597
 
2598
- infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
2599
  validate_seq_str, regex=SMILES_PAT)
2600
  if not infer_df['X1_ERR'].isna().all():
2601
  raise ValueError(
2602
  f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
2603
 
2604
- infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
2605
  validate_seq_str, regex=FASTA_PAT)
2606
  if not infer_df['X2_ERR'].isna().all():
2607
  raise ValueError(
 
45
  from apscheduler.schedulers.background import BackgroundScheduler
46
  from tinydb import TinyDB, Query
47
 
48
+ #import swifter
49
  from tqdm.auto import tqdm
50
 
51
  from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
 
837
  if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
838
  x2 = prediction_df['X2'].iloc[0]
839
  pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
840
+ pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
841
 
842
  @cache
843
  def max_sim(smiles):
 
846
  prediction_df[[
847
  'Max. Tanimoto Similarity to Known Ligands',
848
  'Max. Sim. Ligand'
849
+ ]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
850
 
851
  max_sim.cache_clear()
852
 
853
  if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
854
  x2 = prediction_df['X2'].iloc[0]
855
+ prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
856
 
857
  @cache
858
  def max_id(compound):
 
861
 
862
  prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
863
  'Max. Id. Target']] = (
864
+ prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
865
  )
866
  prediction_df.drop(['X1^'], axis=1, inplace=True)
867
 
 
870
  # Advanced options for Target Protein Identification
871
  if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
872
  x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
873
+ df_training['FP'] = df_training['X1'].parallel_apply(smiles_to_ecfp)
874
 
875
  prediction_df[[
876
  'Max. Tanimoto Similarity to Training Compounds',
 
888
  prediction_df[[
889
  'Max. Sequence Identity to Known Targets of Input Compound',
890
  'Max. Id. Target'
891
+ ]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
892
 
893
  max_id.cache_clear()
894
 
 
904
  prediction_df[[
905
  'Max. Tanimoto Similarity to Known Ligands of Identified Target',
906
  'Max. Sim. Ligand'
907
+ ]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
908
 
909
  max_sim.cache_clear()
910
 
 
949
  orig_df['Target Family'] = None
950
  if orig_df['Target Family'].isna().any():
951
  orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
952
+ orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
953
  )
954
  orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
955
  detect_family.cache_clear()
956
 
957
+ orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize)
958
 
959
  orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
960
  annotated_df = orig_df[~orig_df['Y'].isna()].copy()
 
1109
 
1110
  if 'X1' in df.columns:
1111
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
1112
+ df['Compound'] = df['X1'].parallel_apply(
1113
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
1114
+ df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
1115
+ df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
1116
  df['Pharmacophore'] = None
1117
  if task == 'Compound-Protein Binding Affinity':
1118
  # Convert Y^ from pIC50 (nM) to IC50 (nM)
 
1182
  columns_unique = None
1183
 
1184
  if 'Exclude Pharmacophore 3D' not in opts:
1185
+ df_html['Pharmacophore'] = df_html['Compound'].parallel_apply(
1186
  lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
1187
 
1188
  if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
1189
+ df_html['Compound'] = df_html['Compound'].parallel_apply(
1190
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1191
  else:
1192
  df_html.drop(['Compound'], axis=1, inplace=True)
1193
 
1194
  if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
1195
+ df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
1196
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1197
  else:
1198
  df_html.drop(['Scaffold'], axis=1, inplace=True)
 
1227
  df_html.rename(columns=column_aliases, inplace=True)
1228
  df_html.index.name = 'Index'
1229
  if 'Target FASTA' in df_html.columns:
1230
+ df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
1231
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1232
 
1233
  num_cols = df_html.select_dtypes('number').columns
 
1247
  if 'Target ID' in df_html.columns:
1248
  df_html.drop(['Target FASTA'], axis=1, inplace=True)
1249
  if 'Target FASTA' in df_html.columns:
1250
+ df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
1251
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1252
  if 'Scaffold SMILES' in df_html.columns:
1253
  df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
 
1555
  df_report = df.copy()
1556
  try:
1557
  for filter_name in filter_list:
1558
+ df_report[filter_name] = df_report['Compound'].parallel_apply(
1559
  lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
1560
 
1561
  for score_name in score_list:
1562
+ df_report[score_name] = df_report['Compound'].parallel_apply(
1563
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1564
 
1565
  if opt_list:
 
2263
  alignment = aligner.align(processed_fasta, query)
2264
  return alignment.score / max(len(processed_fasta), len(query))
2265
 
2266
+ alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
2267
  row = alignment_df.loc[alignment_df['score'].idxmax()]
2268
  family = str(row['Target Family']).title()
2269
  return gr.Dropdown(value=family,
 
2595
  infer_df = pd.read_csv(drug_target_pair_upload)
2596
  validate_columns(infer_df, ['X1', 'X2'])
2597
 
2598
+ infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
2599
  validate_seq_str, regex=SMILES_PAT)
2600
  if not infer_df['X1_ERR'].isna().all():
2601
  raise ValueError(
2602
  f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
2603
 
2604
+ infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
2605
  validate_seq_str, regex=FASTA_PAT)
2606
  if not infer_df['X2_ERR'].isna().all():
2607
  raise ValueError(