libokj committed on
Commit
e9baae7
·
verified ·
1 Parent(s): efa49ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -72
app.py CHANGED
@@ -6,7 +6,7 @@ import textwrap
6
  from email.mime.multipart import MIMEMultipart
7
  from email.mime.text import MIMEText
8
  from email.utils import formatdate, make_msgid
9
- from functools import cache
10
  from math import pi
11
  from time import sleep, time
12
  from uuid import uuid4
@@ -25,6 +25,7 @@ import hydra
25
  import pandas as pd
26
  from pandarallel import pandarallel
27
  import requests
 
28
  from requests.adapters import HTTPAdapter, Retry
29
  from markdown import markdown
30
  from rdkit import Chem, DataStructs
@@ -291,29 +292,24 @@ def check_expiry():
291
  send_email(job)
292
 
293
 
294
- @cache
295
- def max_tanimoto_similarity(smi, seen_smiles):
296
  if smi is None:
297
  return 0
 
 
298
  mol = Chem.MolFromSmiles(smi)
299
  if mol is None:
300
  return 0
301
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
302
- max_sim = 0
303
- for smiles in seen_smiles:
304
- mol_seen = Chem.MolFromSmiles(smiles)
305
- mol_seen_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_seen, radius=2, nBits=2048)
306
- sim = DataStructs.TanimotoSimilarity(mol_ecfp, mol_seen_ecfp)
307
- if sim == 1:
308
- return 1
309
- max_sim = max(sim, max_sim)
310
- return max_sim
311
 
312
 
313
- @cache
314
  def max_sequence_identity(seq, seen_fastas):
315
  if seq is None:
316
  return 0
 
 
317
  aligner = PairwiseAligner()
318
  aligner.mode = 'local'
319
  max_id = 0
@@ -328,16 +324,24 @@ def max_sequence_identity(seq, seen_fastas):
328
 
329
  @cache
330
  def get_seen_smiles(family, task):
 
 
 
 
331
  seen_smiles = pd.read_csv(
332
- f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
333
- return seen_smiles['X1'].tolist()
334
 
335
 
336
  @cache
337
  def get_seen_fastas(family, task):
 
 
 
 
338
  seen_fastas = pd.read_csv(
339
- f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
340
- return seen_fastas['X2'].tolist()
341
 
342
 
343
  @cache
@@ -709,7 +713,6 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
709
  error = None
710
  task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
711
  predictions_file = None
712
-
713
  df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
714
  orig_df = pd.read_csv(predict_filepath)
715
  alignment_df = get_fasta_family_map()
@@ -737,12 +740,9 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
737
  if 'Target Family' not in orig_df.columns:
738
  orig_df['Target Family'] = None
739
  if orig_df['Target Family'].isna().any():
740
- orig_df.loc[
741
- orig_df['Target Family'].isna(), 'Target Family'
742
- ] = orig_df.loc[
743
- orig_df['Target Family'].isna(), 'X2'
744
- ].parallel_apply(detect_family)
745
-
746
  detect_family.cache_clear()
747
 
748
  orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
@@ -783,76 +783,82 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
783
  prediction_df = pd.concat([prediction_df, predictions])
784
 
785
  else:
786
- predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_auto_predictions.csv'
787
  task_value = TASK_MAP[task]
788
  score = TASK_METRIC_MAP[task]
789
  benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
790
  predict_df = pd.read_csv(predict_filepath)
791
 
792
  for family, subset in predict_df.groupby('Target Family'):
793
- predict_subset_filepath = f'{SERVER_DATA_DIR}/{job_id}_{family}_input.csv'
 
 
794
  subset.to_csv(predict_subset_filepath, index=False, na_rep='')
795
- seen_compounds = get_seen_smiles(family, task_value)
796
 
 
797
  if subset['X1'].iloc[0] in seen_compounds:
798
  scenario = "Seen Compound"
799
  else:
800
  scenario = "Unseen Compound"
801
 
802
  filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
803
- & (benchmark_df['Scenario'] == scenario)]
 
804
 
805
- preset = filtered_df.loc[filtered_df[score].idxmax(), 'Model']
806
- preset_value = PRESET_MAP[preset]
 
 
 
 
 
 
 
 
 
 
807
 
808
- target_family = TARGET_FAMILY_MAP[family.title()]
 
 
809
  cfg = hydra.compose(
810
  config_name="webserver_inference",
811
  overrides=[f"task={task_value}",
812
  f"preset={preset_value}",
813
- # f"ckpt_path=D:/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
814
  f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
815
  f"data.data_file='{str(predict_subset_filepath)}'"])
816
 
817
  predictions, _ = predict(cfg)
818
  predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
819
- predictions['Source'] = f'Predicted ({preset} {family})'
 
820
  prediction_df = pd.concat([prediction_df, predictions])
821
 
822
  prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
823
  prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
824
 
825
- # prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
826
- # lambda group: group.parallel_apply(
827
- # max_tanimoto_similarity,
828
- # seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
829
- # )
830
- # ).values
831
- #
832
- # prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
833
- # lambda group: group.parallel_apply(
834
- # max_sequence_identity,
835
- # seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
836
- # )
837
- # ).values
838
  if "Include Max. Tanimoto Similarity" in opts:
839
  for family in prediction_df['Target Family'].unique():
840
- prediction_df.loc[
841
- prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = prediction_df.loc[
842
- prediction_df['Target Family'] == family, 'X1'].parallel_apply(
843
- max_tanimoto_similarity,
844
- seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
 
 
 
845
  )
846
- max_tanimoto_similarity.cache_clear()
 
847
  if "Include Max. Sequence Identity" in opts:
848
  for family in prediction_df['Target Family'].unique():
849
- prediction_df.loc[
850
- prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = prediction_df.loc[
851
- prediction_df['Target Family'] == family, 'X2'].parallel_apply(
852
- max_sequence_identity,
853
- seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
854
  )
855
- max_sequence_identity.cache_clear()
 
856
  prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
857
  status = "COMPLETED"
858
 
@@ -1968,9 +1974,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1968
  return [None, family]
1969
 
1970
  if family == 'General':
1971
- seen_targets = pd.read_csv(
1972
- f'data/benchmarks/seen_targets/all_families_full_{task.lower()}_random_split.csv')
1973
- if process_target_fasta(fasta) in seen_targets['X2'].values:
1974
  scenario = "Seen Target"
1975
  else:
1976
  scenario = "Unseen Target"
@@ -1979,16 +1984,14 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1979
  & (benchmark_df['Type'] == 'General')]
1980
 
1981
  else:
1982
- seen_targets_general = pd.read_csv(
1983
- f'data/benchmarks/seen_targets/all_families_full_{task.lower()}_random_split.csv')
1984
- if process_target_fasta(fasta) in seen_targets_general['X2'].values:
1985
  scenario_general = "Seen Target"
1986
  else:
1987
  scenario_general = "Unseen Target"
1988
 
1989
- seen_targets_family = pd.read_csv(
1990
- f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
1991
- if process_target_fasta(fasta) in seen_targets_family['X2'].values:
1992
  scenario_family = "Seen Target"
1993
  else:
1994
  scenario_family = "Unseen Target"
@@ -2008,10 +2011,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2008
  scenario = "Unseen Target (<0.85 sequence identity)"
2009
 
2010
  return {drug_screen_preset:
2011
- gr.Dropdown(value=row['Model'],
2012
- info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
2013
- f"model with the best {score} in the {scenario} scenario "
2014
- f"on {row['Family']}."),
2015
  drug_screen_target_family:
2016
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
2017
 
@@ -2569,4 +2571,4 @@ if __name__ == "__main__":
2569
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
2570
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
2571
  scheduler.add_job(check_expiry, 'interval', hours=1)
2572
- scheduler.start()
 
6
  from email.mime.multipart import MIMEMultipart
7
  from email.mime.text import MIMEText
8
  from email.utils import formatdate, make_msgid
9
+ from functools import cache, partial
10
  from math import pi
11
  from time import sleep, time
12
  from uuid import uuid4
 
25
  import pandas as pd
26
  from pandarallel import pandarallel
27
  import requests
28
+ from rdkit.DataStructs import BulkTanimotoSimilarity
29
  from requests.adapters import HTTPAdapter, Retry
30
  from markdown import markdown
31
  from rdkit import Chem, DataStructs
 
292
  send_email(job)
293
 
294
 
295
+ def max_tanimoto_similarity(smi, seen_smiles_with_fp):
 
296
  if smi is None:
297
  return 0
298
+ if smi in seen_smiles_with_fp['X1'].values:
299
+ return 1
300
  mol = Chem.MolFromSmiles(smi)
301
  if mol is None:
302
  return 0
303
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
304
+ sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
305
+ return max(sims)
 
 
 
 
 
 
 
306
 
307
 
 
308
  def max_sequence_identity(seq, seen_fastas):
309
  if seq is None:
310
  return 0
311
+ if seq in seen_fastas:
312
+ return 1
313
  aligner = PairwiseAligner()
314
  aligner.mode = 'local'
315
  max_id = 0
 
324
 
325
  @cache
326
  def get_seen_smiles(family, task):
327
+ if family == 'General':
328
+ family = 'all_families_full'
329
+ else:
330
+ family = TARGET_FAMILY_MAP[family.title()]
331
  seen_smiles = pd.read_csv(
332
+ f'data/benchmarks/seen_compounds/{family}_{task.lower()}_random_split.csv')
333
+ return seen_smiles
334
 
335
 
336
  @cache
337
  def get_seen_fastas(family, task):
338
+ if family == 'General':
339
+ family = 'all_families_full'
340
+ else:
341
+ family = TARGET_FAMILY_MAP[family.title()]
342
  seen_fastas = pd.read_csv(
343
+ f'data/benchmarks/seen_targets/{family}_{task.lower()}_random_split.csv')
344
+ return seen_fastas
345
 
346
 
347
  @cache
 
713
  error = None
714
  task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
715
  predictions_file = None
 
716
  df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
717
  orig_df = pd.read_csv(predict_filepath)
718
  alignment_df = get_fasta_family_map()
 
740
  if 'Target Family' not in orig_df.columns:
741
  orig_df['Target Family'] = None
742
  if orig_df['Target Family'].isna().any():
743
+ orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
744
+ orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
745
+ )
 
 
 
746
  detect_family.cache_clear()
747
 
748
  orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
 
783
  prediction_df = pd.concat([prediction_df, predictions])
784
 
785
  else:
786
+ predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv'
787
  task_value = TASK_MAP[task]
788
  score = TASK_METRIC_MAP[task]
789
  benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
790
  predict_df = pd.read_csv(predict_filepath)
791
 
792
  for family, subset in predict_df.groupby('Target Family'):
793
+ predict_subset_filepath = os.path.join(
794
+ os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv'
795
+ )
796
  subset.to_csv(predict_subset_filepath, index=False, na_rep='')
 
797
 
798
+ seen_compounds = get_seen_smiles(family, task_value)['X1'].values
799
  if subset['X1'].iloc[0] in seen_compounds:
800
  scenario = "Seen Compound"
801
  else:
802
  scenario = "Unseen Compound"
803
 
804
  filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
805
+ & (benchmark_df['Scenario'] == scenario)
806
+ & (benchmark_df['Type'] == 'Family')]
807
 
808
+ seen_compounds = get_seen_smiles('General', task_value)['X1'].values
809
+ if subset['X1'].iloc[0] in seen_compounds:
810
+ scenario = "Seen Compound"
811
+ else:
812
+ scenario = "Unseen Compound"
813
+
814
+ filtered_df = pd.concat([
815
+ filtered_df,
816
+ benchmark_df[(benchmark_df['Family'] == family.title())
817
+ & (benchmark_df['Scenario'] == scenario)
818
+ & (benchmark_df['Type'] == 'General')]
819
+ ])
820
 
821
+ row = filtered_df.loc[filtered_df[score].idxmax()]
822
+ preset_value = PRESET_MAP[row['Model']]
823
+ target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general'
824
  cfg = hydra.compose(
825
  config_name="webserver_inference",
826
  overrides=[f"task={task_value}",
827
  f"preset={preset_value}",
 
828
  f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
829
  f"data.data_file='{str(predict_subset_filepath)}'"])
830
 
831
  predictions, _ = predict(cfg)
832
  predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
833
+ predictions['Source'] = (f'Predicted ({row["Model"]} '
834
+ f'{family.title() if row["Type"] == "Family" else "General"})')
835
  prediction_df = pd.concat([prediction_df, predictions])
836
 
837
  prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
838
  prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
839
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840
  if "Include Max. Tanimoto Similarity" in opts:
841
  for family in prediction_df['Target Family'].unique():
842
+ family_smiles_df = get_seen_smiles(family=family, task=task_value)
843
+ family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
844
+ lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
845
+ Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
846
+ )
847
+ max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=family_smiles_df))
848
+ prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = (
849
+ prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
850
  )
851
+ max_sim.cache_clear()
852
+
853
  if "Include Max. Sequence Identity" in opts:
854
  for family in prediction_df['Target Family'].unique():
855
+ family_fastas_df = get_seen_fastas(family=family, task=task_value)
856
+ max_id = cache(partial(max_sequence_identity, seen_fastas=family_fastas_df['X2'].values))
857
+ prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = (
858
+ prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
 
859
  )
860
+ max_id.cache_clear()
861
+
862
  prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
863
  status = "COMPLETED"
864
 
 
1974
  return [None, family]
1975
 
1976
  if family == 'General':
1977
+ seen_targets = get_seen_fastas('General', task)['X2'].values
1978
+ if process_target_fasta(fasta) in seen_targets:
 
1979
  scenario = "Seen Target"
1980
  else:
1981
  scenario = "Unseen Target"
 
1984
  & (benchmark_df['Type'] == 'General')]
1985
 
1986
  else:
1987
+ seen_targets_general = get_seen_fastas('General', task)['X2'].values
1988
+ if process_target_fasta(fasta) in seen_targets_general:
 
1989
  scenario_general = "Seen Target"
1990
  else:
1991
  scenario_general = "Unseen Target"
1992
 
1993
+ seen_targets_family = get_seen_fastas(family, task)['X2'].values
1994
+ if process_target_fasta(fasta) in seen_targets_family:
 
1995
  scenario_family = "Seen Target"
1996
  else:
1997
  scenario_family = "Unseen Target"
 
2011
  scenario = "Unseen Target (<0.85 sequence identity)"
2012
 
2013
  return {drug_screen_preset:
2014
+ gr.Dropdown(value=row['Model'],
2015
+ info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
2016
+ f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
 
2017
  drug_screen_target_family:
2018
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
2019
 
 
2571
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
2572
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
2573
  scheduler.add_job(check_expiry, 'interval', hours=1)
2574
+ scheduler.start()