libokj committed on
Commit
0dd77ac
·
verified ·
1 Parent(s): 264757c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -23
app.py CHANGED
@@ -43,7 +43,7 @@ import panel as pn
43
  from apscheduler.schedulers.background import BackgroundScheduler
44
  from tinydb import TinyDB, Query
45
 
46
- import swifter
47
  from tqdm.auto import tqdm
48
 
49
  from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
@@ -786,7 +786,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
786
  orig_df['Target Family'] = None
787
  if orig_df['Target Family'].isna().any():
788
  orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
789
- orig_df.loc[orig_df['Target Family'].isna(), 'X2'].swifter.apply(detect_family)
790
  )
791
  detect_family.cache_clear()
792
 
@@ -885,7 +885,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
885
  if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
886
  for family in prediction_df['Target Family'].unique():
887
  family_smiles_df = get_seen_smiles(family=family, task=task_value)
888
- family_smiles_df['FP'] = family_smiles_df['X1'].swifter.apply(smiles_to_ecfp)
889
 
890
  @cache
891
  def max_sim(smi):
@@ -893,7 +893,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
893
 
894
  prediction_df.loc[
895
  prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
896
- prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].swifter.apply(max_sim)
897
  )
898
  max_sim.cache_clear()
899
 
@@ -907,13 +907,13 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
907
  return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
908
 
909
  prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
910
- prediction_df['X1'].swifter.apply(max_sim).apply(pd.Series)
911
  )
912
  max_sim.cache_clear()
913
 
914
  if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
915
  x2 = prediction_df['X2'].iloc[0]
916
- prediction_df['X1^'] = prediction_df['X1'].swifter.apply(rdkit_canonicalize)
917
 
918
  @cache
919
  def calculate_max_sequence_identity(compound):
@@ -921,7 +921,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
921
  return max_sequence_identity(x2, seen_fastas=compound_targets)
922
 
923
  prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
924
- prediction_df['X1^'].swifter.apply(calculate_max_sequence_identity).apply(pd.Series)
925
  )
926
  prediction_df.drop(['X1^'], axis=1, inplace=True)
927
 
@@ -937,7 +937,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
937
 
938
  prediction_df.loc[
939
  prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
940
- prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].swifter.apply(max_id)
941
  )
942
  max_id.cache_clear()
943
 
@@ -991,10 +991,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
991
 
992
  if 'X1' in df.columns:
993
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
994
- df['Compound'] = df['X1'].swifter.apply(
995
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
996
- df['Scaffold'] = df['Compound'].swifter.apply(MurckoScaffold.GetScaffoldForMol)
997
- df['Scaffold SMILES'] = df['Scaffold'].swifter.apply(lambda x: Chem.MolToSmiles(x))
998
 
999
  if task == 'Compound-Protein Binding Affinity':
1000
  # Convert Y^ from pIC50 to IC50
@@ -1040,13 +1040,13 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1040
  columns_unique = None
1041
 
1042
  if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
1043
- df_html['Compound'] = df_html['Compound'].swifter.apply(
1044
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1045
  else:
1046
  df_html.drop(['Compound'], axis=1, inplace=True)
1047
 
1048
  if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
1049
- df_html['Scaffold'] = df_html['Scaffold'].swifter.apply(
1050
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1051
  else:
1052
  df_html.drop(['Scaffold'], axis=1, inplace=True)
@@ -1076,7 +1076,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1076
  df_html.rename(columns=column_aliases, inplace=True)
1077
  df_html.index.name = 'Index'
1078
  if 'Target FASTA' in df_html.columns:
1079
- df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
1080
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1081
 
1082
  num_cols = df_html.select_dtypes('number').columns
@@ -1094,7 +1094,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1094
  if 'Target ID' in df_html.columns:
1095
  df_html.drop(['Target FASTA'], axis=1, inplace=True)
1096
  if 'Target FASTA' in df_html.columns:
1097
- df_html['Target FASTA'] = df_html['Target FASTA'].swifter.apply(
1098
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1099
  if 'Scaffold SMILES' in df_html.columns:
1100
  df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
@@ -1159,7 +1159,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
1159
 
1160
  report_table = pn.widgets.Tabulator(
1161
  df_html, formatters=formatters,
1162
- frozen_columns=['Index', 'Target ID', 'Compound ID', 'Compound', 'Scaffold'],
 
 
1163
  disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
1164
 
1165
  for i, col in enumerate(num_cols):
@@ -1314,7 +1316,7 @@ def create_pie_chart(df, category, value, top_k):
1314
  ("Percentage", "@proportion{0.0%}")
1315
  ]
1316
 
1317
- if category == 'Scaffold SMILES':
1318
  data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
1319
  left_on='Scaffold SMILES', right_on='Scaffold SMILES')
1320
  tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
@@ -1353,11 +1355,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
1353
  df_report = df.copy()
1354
  try:
1355
  for filter_name in filter_list:
1356
- df_report[filter_name] = df_report['Compound'].swifter.apply(
1357
  lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
1358
 
1359
  for score_name in score_list:
1360
- df_report[score_name] = df_report['Compound'].swifter.apply(
1361
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1362
 
1363
  return (create_html_report(df_report, file=None, task=task), df_report,
@@ -1990,7 +1992,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1990
  alignment = aligner.align(processed_fasta, query)
1991
  return alignment.score / max(len(processed_fasta), len(query))
1992
 
1993
- alignment_df['score'] = alignment_df['X2'].swifter.apply(align_score)
1994
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1995
  family = str(row['Target Family']).title()
1996
  return gr.Dropdown(value=family,
@@ -2316,13 +2318,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2316
  infer_df = pd.read_csv(drug_target_pair_upload)
2317
  validate_columns(infer_df, ['X1', 'X2'])
2318
 
2319
- infer_df['X1_ERR'] = infer_df['X1'].swifter.apply(
2320
  validate_seq_str, regex=SMILES_PAT)
2321
  if not infer_df['X1_ERR'].isna().all():
2322
  raise ValueError(
2323
  f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
2324
 
2325
- infer_df['X2_ERR'] = infer_df['X2'].swifter.apply(
2326
  validate_seq_str, regex=FASTA_PAT)
2327
  if not infer_df['X2_ERR'].isna().all():
2328
  raise ValueError(
@@ -2546,6 +2548,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2546
  info=f'Found {label} in your uploaded dataset. '
2547
  'Is it compound-protein interaction or binding affinity?'),
2548
  html_report: ''}
 
 
2549
 
2550
 
2551
  report_df_change = file_for_report.change(
@@ -2562,7 +2566,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2562
  concurrency_limit=100,
2563
  ).success(
2564
  fn=inquire_task, inputs=[raw_df],
2565
- outputs=[report_task, html_report, analyze_btn, csv_generate, html_generate],
2566
  )
2567
 
2568
  file_for_report.clear(
 
43
  from apscheduler.schedulers.background import BackgroundScheduler
44
  from tinydb import TinyDB, Query
45
 
46
+ # import swifter
47
  from tqdm.auto import tqdm
48
 
49
  from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
 
786
  orig_df['Target Family'] = None
787
  if orig_df['Target Family'].isna().any():
788
  orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
789
+ orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
790
  )
791
  detect_family.cache_clear()
792
 
 
885
  if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
886
  for family in prediction_df['Target Family'].unique():
887
  family_smiles_df = get_seen_smiles(family=family, task=task_value)
888
+ family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
889
 
890
  @cache
891
  def max_sim(smi):
 
893
 
894
  prediction_df.loc[
895
  prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
896
+ prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
897
  )
898
  max_sim.cache_clear()
899
 
 
907
  return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
908
 
909
  prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
910
+ prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
911
  )
912
  max_sim.cache_clear()
913
 
914
  if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
915
  x2 = prediction_df['X2'].iloc[0]
916
+ prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
917
 
918
  @cache
919
  def calculate_max_sequence_identity(compound):
 
921
  return max_sequence_identity(x2, seen_fastas=compound_targets)
922
 
923
  prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
924
+ prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
925
  )
926
  prediction_df.drop(['X1^'], axis=1, inplace=True)
927
 
 
937
 
938
  prediction_df.loc[
939
  prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
940
+ prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
941
  )
942
  max_id.cache_clear()
943
 
 
991
 
992
  if 'X1' in df.columns:
993
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
994
+ df['Compound'] = df['X1'].parallel_apply(
995
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
996
+ df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
997
+ df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
998
 
999
  if task == 'Compound-Protein Binding Affinity':
1000
  # Convert Y^ from pIC50 to IC50
 
1040
  columns_unique = None
1041
 
1042
  if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
1043
+ df_html['Compound'] = df_html['Compound'].parallel_apply(
1044
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1045
  else:
1046
  df_html.drop(['Compound'], axis=1, inplace=True)
1047
 
1048
  if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
1049
+ df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
1050
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
1051
  else:
1052
  df_html.drop(['Scaffold'], axis=1, inplace=True)
 
1076
  df_html.rename(columns=column_aliases, inplace=True)
1077
  df_html.index.name = 'Index'
1078
  if 'Target FASTA' in df_html.columns:
1079
+ df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
1080
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1081
 
1082
  num_cols = df_html.select_dtypes('number').columns
 
1094
  if 'Target ID' in df_html.columns:
1095
  df_html.drop(['Target FASTA'], axis=1, inplace=True)
1096
  if 'Target FASTA' in df_html.columns:
1097
+ df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
1098
  lambda x: wrap_text(x) if not pd.isna(x) else x)
1099
  if 'Scaffold SMILES' in df_html.columns:
1100
  df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
 
1159
 
1160
  report_table = pn.widgets.Tabulator(
1161
  df_html, formatters=formatters,
1162
+ frozen_columns=[col for col in df_html.columns if col in [
1163
+ 'Index', 'Target ID', 'Compound ID', 'Compound', 'Scaffold'
1164
+ ]],
1165
  disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
1166
 
1167
  for i, col in enumerate(num_cols):
 
1316
  ("Percentage", "@proportion{0.0%}")
1317
  ]
1318
 
1319
+ if category == 'Scaffold SMILES' and 'Scaffold' in df.columns:
1320
  data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
1321
  left_on='Scaffold SMILES', right_on='Scaffold SMILES')
1322
  tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
 
1355
  df_report = df.copy()
1356
  try:
1357
  for filter_name in filter_list:
1358
+ df_report[filter_name] = df_report['Compound'].parallel_apply(
1359
  lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
1360
 
1361
  for score_name in score_list:
1362
+ df_report[score_name] = df_report['Compound'].parallel_apply(
1363
  lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
1364
 
1365
  return (create_html_report(df_report, file=None, task=task), df_report,
 
1992
  alignment = aligner.align(processed_fasta, query)
1993
  return alignment.score / max(len(processed_fasta), len(query))
1994
 
1995
+ alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
1996
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1997
  family = str(row['Target Family']).title()
1998
  return gr.Dropdown(value=family,
 
2318
  infer_df = pd.read_csv(drug_target_pair_upload)
2319
  validate_columns(infer_df, ['X1', 'X2'])
2320
 
2321
+ infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
2322
  validate_seq_str, regex=SMILES_PAT)
2323
  if not infer_df['X1_ERR'].isna().all():
2324
  raise ValueError(
2325
  f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
2326
 
2327
+ infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
2328
  validate_seq_str, regex=FASTA_PAT)
2329
  if not infer_df['X2_ERR'].isna().all():
2330
  raise ValueError(
 
2548
  info=f'Found {label} in your uploaded dataset. '
2549
  'Is it compound-protein interaction or binding affinity?'),
2550
  html_report: ''}
2551
+ else:
2552
+ return {report_task: gr.Dropdown(visible=False)}
2553
 
2554
 
2555
  report_df_change = file_for_report.change(
 
2566
  concurrency_limit=100,
2567
  ).success(
2568
  fn=inquire_task, inputs=[raw_df],
2569
+ outputs=[report_task, html_report],
2570
  )
2571
 
2572
  file_for_report.clear(