libokj committed on
Commit
db33be0
·
verified ·
1 Parent(s): 15b93be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -36
app.py CHANGED
@@ -278,36 +278,70 @@ def check_expiry():
278
 
279
 
280
  def max_tanimoto_similarity(smi, seen_smiles_with_fp):
281
- if smi is None:
282
- return 0
 
283
  if smi in seen_smiles_with_fp['X1'].values:
284
- return 1
 
 
 
 
 
 
285
  mol = Chem.MolFromSmiles(smi)
286
  if mol is None:
287
- return 0
 
288
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
289
  sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
290
- return max(sims)
 
 
 
 
 
 
 
291
 
292
 
293
  def max_sequence_identity(seq, seen_fastas):
294
- if seq is None:
295
- return 0
296
- if seq in seen_fastas:
297
- return 1
 
 
 
 
 
 
 
298
  aligner = PairwiseAligner()
299
  aligner.mode = 'local'
300
- max_id = 0
301
- for fasta in seen_fastas:
 
302
  alignment = aligner.align(seq, fasta)
303
  identity = alignment.score / max(len(seq), len(fasta))
304
  if identity == 1:
305
- return 1
306
- max_id = max(identity, max_id)
307
- return max_id
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
 
310
- @cache
311
  def get_seen_smiles(family, task):
312
  if family == 'General':
313
  family = 'all_families_full'
@@ -318,7 +352,6 @@ def get_seen_smiles(family, task):
318
  return seen_smiles
319
 
320
 
321
- @cache
322
  def get_seen_fastas(family, task):
323
  if family == 'General':
324
  family = 'all_families_full'
@@ -717,7 +750,8 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
717
  return row['Target Family']
718
  # If no exact match, then calculate alignment score
719
  else:
720
- aligner = PairwiseAligner(mode='local')
 
721
 
722
  def align_score(target):
723
  alignment = aligner.align(query, target)
@@ -827,24 +861,62 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
827
  prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
828
  prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
829
 
830
- if "Include Max. Tanimoto Similarity" in opts:
831
  for family in prediction_df['Target Family'].unique():
832
  family_smiles_df = get_seen_smiles(family=family, task=task_value)
833
  family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
834
  lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
835
  Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
836
  )
837
- max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=family_smiles_df))
838
- prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = (
 
 
 
 
 
839
  prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
840
  )
841
  max_sim.cache_clear()
842
 
843
- if "Include Max. Sequence Identity" in opts:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
844
  for family in prediction_df['Target Family'].unique():
845
  family_fastas_df = get_seen_fastas(family=family, task=task_value)
846
- max_id = cache(partial(max_sequence_identity, seen_fastas=family_fastas_df['X2'].values))
847
- prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = (
 
 
 
 
 
848
  prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
849
  )
850
  max_id.cache_clear()
@@ -1499,10 +1571,18 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1499
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1500
 
1501
  drug_screen_opts = gr.CheckboxGroup(
1502
- ['Include Max. Tanimoto Similarity'],
1503
- label='Step 6. Select Additional Options',
1504
- info="Calculating the maximum Tanimoto similarity of the library compounds to the "
1505
- "training dataset is an experimental feature and may take a considerable amount of time."
 
 
 
 
 
 
 
 
1506
  )
1507
  with gr.Row():
1508
  with gr.Column():
@@ -1516,7 +1596,6 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1516
  with gr.Row():
1517
  drug_screen_clr_btn = gr.ClearButton(size='lg')
1518
  drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
1519
- # TODO Modify the pd df directly with df['X2'] = target
1520
 
1521
  screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
1522
 
@@ -1606,10 +1685,11 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1606
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1607
 
1608
  target_identify_opts = gr.CheckboxGroup(
1609
- ['Include Max. Sequence Identity'],
1610
  label='Step 6. Select Additional Options',
1611
- info="Calculating the maximum sequence identity of the library protein to the "
1612
- "training dataset is an experimental feature and may take a considerable amount of time."
 
1613
  )
1614
  with gr.Row():
1615
  with gr.Column():
@@ -1934,6 +2014,14 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1934
  x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
1935
  ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
1936
 
 
 
 
 
 
 
 
 
1937
 
1938
  def example_fill(input_type):
1939
  return {target_id: 'Q16539',
@@ -2558,14 +2646,14 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2558
 
2559
  if __name__ == "__main__":
2560
  pandarallel.initialize()
2561
-
2562
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
2563
-
2564
  session = requests.Session()
2565
  ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
2566
  session.mount('http://', ADAPTER)
2567
  session.mount('https://', ADAPTER)
2568
-
2569
  db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
2570
  # Set all RUNNING jobs to FAILED at TinyDB initialization
2571
  Job = Query()
@@ -2573,9 +2661,9 @@ if __name__ == "__main__":
2573
  for job in jobs:
2574
  if job['status'] == 'RUNNING':
2575
  db.update({'status': 'FAILED'}, Job.id == job['id'])
2576
-
2577
  scheduler = BackgroundScheduler()
2578
  scheduler.add_job(check_expiry, 'interval', hours=1)
2579
  scheduler.start()
2580
-
2581
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
 
278
 
279
 
280
  def max_tanimoto_similarity(smi, seen_smiles_with_fp):
281
+ if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
282
+ return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
283
+
284
  if smi in seen_smiles_with_fp['X1'].values:
285
+ compound = smi
286
+ if 'ID1' in seen_smiles_with_fp.columns:
287
+ id1 = seen_smiles_with_fp.loc[seen_smiles_with_fp['X1'] == smi, 'ID1'].values[0]
288
+ if pd.notnull(id1) and id1 != '':
289
+ compound = id1
290
+ return {'Max. Tanimoto Similarity': 1, 'Max. Tanimoto Similarity Compound': compound}
291
+
292
  mol = Chem.MolFromSmiles(smi)
293
  if mol is None:
294
+ return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}
295
+
296
  mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
297
  sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
298
+ idx = sims.argmax()
299
+ compound = seen_smiles_with_fp.iloc[idx]['X1']
300
+ if 'ID1' in seen_smiles_with_fp.columns:
301
+ id1 = seen_smiles_with_fp.iloc[idx]['ID1']
302
+ if pd.notnull(id1) and id1 != '':
303
+ compound = id1
304
+
305
+ return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
306
 
307
 
308
  def max_sequence_identity(seq, seen_fastas):
309
+ if seq is None or seen_fastas is None or seen_fastas.empty:
310
+ return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}
311
+
312
+ if seq in seen_fastas['X2'].values:
313
+ target = seq
314
+ if 'ID2' in seen_fastas.columns:
315
+ id2 = seen_fastas.loc[seen_fastas['X2'] == seq, 'ID2'].values[0]
316
+ if pd.notnull(id2) and id2 != '':
317
+ target = id2
318
+ return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
319
+
320
  aligner = PairwiseAligner()
321
  aligner.mode = 'local'
322
+ max_iden = 0
323
+ target = None
324
+ for fasta in seen_fastas['X2'].values:
325
  alignment = aligner.align(seq, fasta)
326
  identity = alignment.score / max(len(seq), len(fasta))
327
  if identity == 1:
328
+ target = fasta
329
+ if 'ID2' in seen_fastas.columns:
330
+ id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
331
+ if pd.notnull(id2) and id2 != '':
332
+ target = id2
333
+ return {'Max. Sequence Identity': 1, 'Max. Sequence Identity Target': target}
334
+ if identity > max_iden:
335
+ max_iden = identity
336
+ target = fasta
337
+ if 'ID2' in seen_fastas.columns:
338
+ id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
339
+ if pd.notnull(id2) and id2 != '':
340
+ target = id2
341
+
342
+ return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
343
 
344
 
 
345
  def get_seen_smiles(family, task):
346
  if family == 'General':
347
  family = 'all_families_full'
 
352
  return seen_smiles
353
 
354
 
 
355
  def get_seen_fastas(family, task):
356
  if family == 'General':
357
  family = 'all_families_full'
 
750
  return row['Target Family']
751
  # If no exact match, then calculate alignment score
752
  else:
753
+ aligner = PairwiseAligner()
754
+ aligner.mode = 'local'
755
 
756
  def align_score(target):
757
  alignment = aligner.align(query, target)
 
861
  prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
862
  prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
863
 
864
+ if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
865
  for family in prediction_df['Target Family'].unique():
866
  family_smiles_df = get_seen_smiles(family=family, task=task_value)
867
  family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
868
  lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
869
  Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
870
  )
871
+
872
+ @cache
873
+ def max_sim(smi):
874
+ return max_tanimoto_similarity(smi, family_smiles_df)['Max. Tanimoto Similarity']
875
+
876
+ prediction_df.loc[
877
+ prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
878
  prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
879
  )
880
  max_sim.cache_clear()
881
 
882
+ if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
883
+ x2 = prediction_df['X2'].iloc[0]
884
+ pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)]
885
+ pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(
886
+ lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
887
+ Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
888
+ )
889
+ max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=pos_compounds_df))
890
+ prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
891
+ prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
892
+ )
893
+ max_sim.cache_clear()
894
+
895
+ if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
896
+ x2 = prediction_df['X2'].iloc[0]
897
+ prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
898
+
899
+ @cache
900
+ def calculate_max_sequence_identity(compound):
901
+ compound_targets = df_training.loc[df_training['X1'] == compound]
902
+ return max_sequence_identity(x2, seen_fastas=compound_targets)
903
+
904
+ prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
905
+ prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
906
+ )
907
+
908
+ calculate_max_sequence_identity.cache_clear()
909
+
910
+ if "Include Target Max. Sequence Identity to Training Targets" in opts:
911
  for family in prediction_df['Target Family'].unique():
912
  family_fastas_df = get_seen_fastas(family=family, task=task_value)
913
+
914
+ @cache
915
+ def max_id(seq):
916
+ return max_sequence_identity(seq, seen_fastas=family_fastas_df)['Max. Sequence Identity']
917
+
918
+ prediction_df.loc[
919
+ prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
920
  prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
921
  )
922
  max_id.cache_clear()
 
1571
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1572
 
1573
  drug_screen_opts = gr.CheckboxGroup(
1574
+ label="Step 6. Select Additional Options",
1575
+ choices=[
1576
+ 'Include Compound Max. Tanimoto Similarity to Training Compounds',
1577
+ 'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
1578
+ 'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
1579
+ ],
1580
+ info="These are experimental features and may increase the job computation time. "
1581
+ "Compound Max. Tanimoto Similarity to Training Compounds and "
1582
+ "Target Max. Sequence Identity to Known Interacting Targets of Compound "
1583
+ "are indicative of the predictive reliability of the model (the higher the more reliable), "
1584
+ "while Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
1585
+ "is indicative of the novelty of the compound (the lower the more novel)."
1586
  )
1587
  with gr.Row():
1588
  with gr.Column():
 
1596
  with gr.Row():
1597
  drug_screen_clr_btn = gr.ClearButton(size='lg')
1598
  drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
 
1599
 
1600
  screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
1601
 
 
1685
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1686
 
1687
  target_identify_opts = gr.CheckboxGroup(
1688
+ ['Include Target Max. Sequence Identity to Training Targets'],
1689
  label='Step 6. Select Additional Options',
1690
+ info="These are experimental features and may increase the job computation time. "
1691
+ "Target Max. Sequence Identity to Training Targets is indicative of the "
1692
+ "predictive reliability of the model (the higher the more reliable)."
1693
  )
1694
  with gr.Row():
1695
  with gr.Column():
 
2014
  x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
2015
  ], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
2016
 
2017
+ drug_screen_task.select(
2018
+ fn=lambda task, opts: [opt for opt in opts if opt not in [
2019
+ 'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
2020
+ 'Include Target Max. Sequence Identity to Known Interacting Targets of Compound'
2021
+ ]] if task == 'Compound-Protein Binding Affinity' else opts,
2022
+ inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
2023
+ show_progress='hidden'
2024
+ )
2025
 
2026
  def example_fill(input_type):
2027
  return {target_id: 'Q16539',
 
2646
 
2647
  if __name__ == "__main__":
2648
  pandarallel.initialize()
2649
+
2650
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
2651
+
2652
  session = requests.Session()
2653
  ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
2654
  session.mount('http://', ADAPTER)
2655
  session.mount('https://', ADAPTER)
2656
+
2657
  db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
2658
  # Set all RUNNING jobs to FAILED at TinyDB initialization
2659
  Job = Query()
 
2661
  for job in jobs:
2662
  if job['status'] == 'RUNNING':
2663
  db.update({'status': 'FAILED'}, Job.id == job['id'])
2664
+
2665
  scheduler = BackgroundScheduler()
2666
  scheduler.add_job(check_expiry, 'interval', hours=1)
2667
  scheduler.start()
2668
+
2669
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)