libokj committed · verified
Commit c95ea1d · 1 Parent(s): ccdad94

Update app.py

Files changed (1):
  1. app.py +379 -112

app.py CHANGED
@@ -27,8 +27,8 @@ from pandarallel import pandarallel
27
  import requests
28
  from requests.adapters import HTTPAdapter, Retry
29
  from markdown import markdown
30
- from rdkit import Chem
31
- from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
32
  from rdkit.Chem.Scaffolds import MurckoScaffold
33
  import seaborn as sns
34
 
@@ -196,6 +196,13 @@ TARGET_FAMILY_MAP = {
196
  'Nuclear Receptor': 'nuclear_receptor',
197
  'Ion Channel': 'ion_channel',
198
  'Others': 'others',
199
  }
200
 
201
  TARGET_LIBRARY_MAP = {
@@ -247,7 +254,7 @@ def remove_job_record(job_id):
247
  # Delete the job from the database
248
  db.remove(Job.id == job_id)
249
  # Delete the corresponding files
250
- files = glob.glob(f"/data/{job_id}*")
251
  for file_path in files:
252
  if os.path.exists(file_path):
253
  os.remove(file_path)
@@ -265,7 +272,7 @@ def check_expiry():
265
  # Delete the job from the database
266
  db.remove(Job.id == job['id'])
267
  # Delete the corresponding file
268
- files = glob.glob(f"/data/{job['id']}*")
269
  for file_path in files:
270
  if os.path.exists(file_path):
271
  os.remove(file_path)
@@ -278,8 +285,63 @@ def check_expiry():
278
  send_email(job)
279
 
280
 
281
- scheduler.add_job(check_expiry, 'interval', hours=1)
282
- scheduler.start()
283
 
284
 
285
  def lipinski(mol):
@@ -635,46 +697,155 @@ using the job id. You will also receive an email notification once the job is do
635
  raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
636
 
637
 
638
- def submit_predict(predict_filepath, task, preset, target_family, state):
639
  job_id = state['id']
640
  status = "RUNNING"
641
  error = None
642
  task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
643
  predictions_file = None
644
  try:
645
- target_family = TARGET_FAMILY_MAP[target_family]
646
 
647
- predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_{target_family}_predictions.csv'
648
 
649
- task = TASK_MAP[task]
650
- preset = PRESET_MAP[preset]
651
-
652
- prediction_df = pd.DataFrame()
653
- cfg = hydra.compose(
654
- config_name="webserver_inference",
655
- overrides=[f"task={task}",
656
- f"preset={preset}",
657
- f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
658
- f"data.data_file='{str(predict_filepath)}'"])
659
- # with concurrent.futures.ThreadPoolExecutor() as executor:
660
- # future = executor.submit(predict, cfg)
661
- # try:
662
- # predictions, _ = future.result(timeout=4*60*60)
663
- # except concurrent.futures.TimeoutError:
664
- # raise gr.Error("Prediction timed out.")
665
- predictions, _ = predict(cfg)
666
- predictions = [pd.DataFrame(prediction) for prediction in predictions]
667
- prediction_df = pd.concat([prediction_df, pd.concat(predictions, ignore_index=True)])
668
- prediction_df.set_index('N', inplace=True)
669
- orig_df = pd.read_csv(
670
- predict_filepath,
671
- usecols=lambda x: x not in ['X1', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES',
672
- 'X2', 'ID2',
673
- 'Y', 'Y^']
674
- )
675
- prediction_df = pd.merge(prediction_df, orig_df, left_index=True, right_index=True, how='left')
676
 
677
- prediction_df.to_csv(predictions_file)
678
  status = "COMPLETED"
679
 
680
  return {run_state: False}
@@ -714,19 +885,21 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
714
  task = 'Compound-Protein Binding Affinity'
715
 
716
  df = pd.read_csv(file)
 
717
  if 'N' in df.columns:
718
  df.set_index('N', inplace=True)
 
719
  if not any(col in ['X1', 'X2'] for col in df.columns):
720
  gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
721
  return {analyze_btn: gr.Button(interactive=False)}
 
722
  if 'X1' in df.columns:
723
- df['Scaffold SMILES'] = df['X1'].parallel_apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
724
- df['Scaffold'] = df['Scaffold SMILES'].parallel_apply(
725
- lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
726
- # Add a new column with RDKit molecule objects
727
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
728
  df['Compound'] = df['X1'].parallel_apply(
729
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
730
 
731
  # DF_FOR_REPORT = df.copy()
732
 
@@ -752,7 +925,7 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
752
  return {analyze_btn: gr.Button(interactive=False)}
753
 
754
 
755
- def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm=True)):
756
  df_html = df.copy(deep=True)
757
  column_aliases = COLUMN_ALIASES.copy()
758
  cols_left = list(pd.Index(
@@ -763,9 +936,9 @@ def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm
763
  if isinstance(task, str):
764
  column_aliases.update({
765
  'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
766
- else 'Actual Binding Affinity',
767
  'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
768
- else 'Predicted Binding Affinity'
769
  })
770
 
771
  ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
@@ -803,12 +976,17 @@ def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm
803
 
804
  elif 'Y^' in df_html.columns:
805
  job = 'Interaction Pair Inference'
806
- if 'Compound' in df_html.columns:
807
  df_html['Compound'] = df_html['Compound'].parallel_apply(
808
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
809
- if 'Scaffold' in df_html.columns:
810
  df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
811
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
 
 
812
 
813
  df_html.rename(columns=column_aliases, inplace=True)
814
  df_html.index.name = 'Index'
@@ -1276,7 +1454,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1276
  "Interaction prediction provides you binding probability score between the target of "
1277
  "interest and each compound in the library, "
1278
  "while affinity prediction directly estimates their binding strength measured using "
1279
- "IC50."
1280
  )
1281
  drug_screen_task = gr.Dropdown(
1282
  list(TASK_MAP.keys()),
@@ -1313,17 +1491,24 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1313
  drug_library_upload_btn = gr.UploadButton(
1314
  label='OR Upload Your Own Library', variant='primary')
1315
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1316
  with gr.Row():
1317
  with gr.Column():
1318
  drug_screen_email = gr.Textbox(
1319
- label='Step 6. Input Your Email Address (Optional)',
1320
  info="Your email address will be used to notify you of the status of your job. "
1321
  "If you cannot receive the email, please check your spam/junk folder."
1322
  )
1323
 
1324
  with gr.Row(visible=True):
1325
  with gr.Column():
1326
- # drug_screen_clr_btn = gr.ClearButton(size='lg')
1327
  drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
1328
  # TODO Modify the pd df directly with df['X2'] = target
1329
 
@@ -1359,26 +1544,25 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1359
  example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
1360
 
1361
  with gr.Row():
1362
- with gr.Column(visible=False):
1363
  HelpTip(
1364
  "By default, models trained on all protein families (general) will be applied. "
1365
- # "If the proteins in the target library of interest all belong to the same protein "
1366
- # "family, manually selecting the family is supported."
1367
  )
1368
  target_identify_target_family = gr.Dropdown(
1369
- choices=['General'], value='General',
1370
- label='Target Family')
1371
-
1372
- with gr.Row():
1373
  with gr.Column():
1374
  HelpTip(
1375
  "Interaction prediction provides you binding probability score between the target of "
1376
  "interest and each compound in the library, while affinity prediction directly "
1377
- "estimates their binding strength measured using IC50."
1378
  )
1379
  target_identify_task = gr.Dropdown(
1380
  list(TASK_MAP.keys()),
1381
- label='Step 2. Select a Prediction Task',
1382
  value='Compound-Protein Interaction')
1383
 
1384
  with gr.Column():
@@ -1389,8 +1573,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1389
  "Please refer to the documentation for detailed benchmark results."
1390
  )
1391
  target_identify_preset = gr.Dropdown(
1392
- list(PRESET_MAP.keys()),
1393
- label='Step 3. Select a Preset Model')
1394
  identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
1395
  variant='primary')
1396
  with gr.Row():
@@ -1403,7 +1587,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1403
  "and can be downloaded by clicking the lower right corner."
1404
  )
1405
  target_library = gr.Dropdown(
1406
- label='Step 4. Select a Preset Target Library',
1407
  choices=list(TARGET_LIBRARY_MAP.keys()))
1408
  with gr.Row():
1409
  gr.File(label='Example FASTA target library',
@@ -1414,16 +1598,23 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1414
  label='OR Upload Your Own Library', variant='primary')
1415
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1416
 
1417
  with gr.Row():
1418
  with gr.Column():
1419
  target_identify_email = gr.Textbox(
1420
- label='Step 5. Input Your Email Address (Optional)',
1421
  info="Your email address will be used to notify you of the status of your job. "
1422
  "If you cannot receive the email, please check your spam/junk folder."
1423
  )
1424
 
1425
  with gr.Row(visible=True):
1426
- # target_identify_clr_btn = gr.ClearButton(size='lg')
1427
  target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
1428
  size='lg')
1429
 
@@ -1501,7 +1692,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1501
  "Interaction prediction provides you binding probability score "
1502
  "between the target of interest and each compound in the library, "
1503
  "while affinity prediction directly estimates their binding strength "
1504
- "measured using IC50."
1505
  )
1506
  pair_infer_task = gr.Dropdown(
1507
  list(TASK_MAP.keys()),
@@ -1525,7 +1716,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1525
  "If you cannot receive the email, please check your spam/junk folder.")
1526
 
1527
  with gr.Row(visible=True):
1528
- # pair_infer_clr_btn = gr.ClearButton(size='lg')
1529
  pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
1530
 
1531
  infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
@@ -1546,25 +1737,33 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1546
  Please first `Preview` the report, then `Generate` and download a CSV report
1547
  or an interactive HTML report below if you wish to access the full report.
1548
  ''')
 
 
1549
  with gr.Row():
1550
- with gr.Column():
1551
  file_for_report = gr.File(interactive=True, type='filepath')
1552
  report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
1553
- label='Specify the Task Labels in the Upload Dataset')
1554
- raw_df = gr.State(value=pd.DataFrame())
1555
- report_df = gr.State(value=pd.DataFrame())
1556
- scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Scores')
1557
- filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Filters')
 
1559
  with gr.Row():
1560
- # clear_btn = gr.ClearButton(size='lg')
1561
- analyze_btn = gr.Button('Preview Top 30 Records', variant='primary', size='lg',
1562
- interactive=False)
1563
 
1564
  with gr.Row():
1565
  with gr.Column(scale=3):
1566
  html_report = gr.HTML() # label='Results', visible=True)
1567
- ranking_pie_chart = gr.Plot(visible=False)
1568
 
1569
  with gr.Row():
1570
  with gr.Column():
@@ -1584,8 +1783,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1584
  if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
1585
 
1586
  You will be redirected to Chemical Property Report for carrying out further analysis and
1587
- generating the full report if the job is done. If the Lookup fails to respond, please come back
1588
- in five minutes, refresh the page, and try again.
1589
  ''')
1590
  with gr.Column():
1591
  pred_lookup_id = gr.Textbox(
@@ -1689,8 +1888,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1689
 
1690
  def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
1691
  try:
1692
- aligner = PairwiseAligner(scoring='blastp', mode='local')
1693
- alignment_df = pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv')
1694
 
1695
  processed_fasta = process_target_fasta(fasta)
1696
 
@@ -1698,18 +1897,20 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
1698
  exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
1699
  if not exact_match.empty:
1700
  row = exact_match.iloc[0]
1701
- return gr.Dropdown(value=row['protein_family'],
1702
- info=f"Reason: Exact match found with {row['ID2']} from family {row['protein_family']}")
 
1703
 
1704
  # If no exact match, then calculate alignment score
1705
  def align_score(query):
1706
- return aligner.align(processed_fasta, query).score
 
1707
 
1708
  alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
1709
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1710
- return gr.Dropdown(value=row['protein_family'],
1711
- info=f"Reason: Best BLASTP score ({row['score']}) "
1712
- f"with {row['ID2']} from family {row['protein_family']}")
1713
  except Exception as e:
1714
  gr.Warning("Failed to detect the protein family due to error: " + str(e))
1715
 
@@ -1772,7 +1973,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1772
  scenario_general = "Unseen Target"
1773
 
1774
  seen_targets_family = pd.read_csv(
1775
- f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family]}_{task.lower()}_random_split.csv')
1776
  if process_target_fasta(fasta) in seen_targets_family['X2'].values:
1777
  scenario_family = "Seen Target"
1778
  else:
@@ -1787,12 +1988,16 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1787
  filtered_df = pd.concat([filtered_df_general, filtered_df_family])
1788
 
1789
  row = filtered_df.loc[filtered_df[score].idxmax()]
1790
 
1791
  return {drug_screen_preset:
1792
  gr.Dropdown(value=row['Model'],
1793
  info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
1794
- f"model with the best {score} ({float(row[score]):.3f}) "
1795
- f"in the {row['Scenario']} scenario on {row['Family']}."),
1796
  drug_screen_target_family:
1797
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
1798
 
@@ -1848,9 +2053,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1848
  gr.Warning('Please enter a valid SMILES for model recommendation.')
1849
  return None
1850
 
1851
- seen_drugs = pd.read_csv(
1852
- f'data/benchmarks/seen_drugs/all_families_full_{task.lower()}_random_split.csv')
1853
- if rdkit_canonicalize(smiles) in seen_drugs['X1'].values:
1854
  scenario = "Seen Compound"
1855
  else:
1856
  scenario = "Unseen Compound"
@@ -1863,8 +2068,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1863
 
1864
  return gr.Dropdown(value=row['Model'],
1865
  info=f"Reason: {scenario} in training; choosing the model "
1866
- f"with the best {score} ({float(row[score]):.3f}) "
1867
- f"in the {scenario} scenario.")
1868
 
1869
 
1870
  identify_preset_recommend_btn.click(fn=identify_recommend_model,
@@ -1965,7 +2169,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1965
 
1966
  job_id = str(uuid4())
1967
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
1968
- screen_df.to_csv(temp_file, index=False)
1969
  if temp_file.is_file():
1970
  job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
1971
  return {screen_data_for_predict: str(temp_file),
@@ -1995,7 +2199,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
1995
 
1996
  job_id = str(uuid4())
1997
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
1998
- identify_df.to_csv(temp_file, index=False)
1999
  if temp_file.is_file():
2000
  job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
2001
  return {identify_data_for_predict: str(temp_file),
@@ -2043,7 +2247,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2043
  f'than the allowed maximum {DATASET_MAX_LEN}.')
2044
 
2045
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2046
- infer_df.to_csv(temp_file, index=False)
2047
 
2048
  else:
2049
  raise gr.Error('Should upload a compound-protein pair dataset, or '
@@ -2093,10 +2297,54 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2093
  drug_screen_click.success(
2094
  fn=submit_predict,
2095
  inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
2096
- drug_screen_target_family, run_state, ],
2097
  outputs=[run_state, ]
2098
  )
2099
 
2100
  target_identify_click = target_identify_btn.click(
2101
  fn=target_identify_validate,
2102
  inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
@@ -2125,7 +2373,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2125
  target_identify_click.success(
2126
  fn=submit_predict,
2127
  inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
2128
- target_identify_target_family, run_state, ], # , target_identify_email],
2129
  outputs=[run_state, ]
2130
  )
2131
 
@@ -2200,6 +2448,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2200
  report_df_change = file_for_report.change(
2201
  fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
2202
  concurrency_limit=100,
2203
  )
2204
 
2205
  file_for_report.upload(
@@ -2214,8 +2465,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2214
  file_for_report.clear(
2215
  fn=lambda: [gr.Button(interactive=False)] * 3 +
2216
  [gr.File(visible=False, value=None)] * 2 +
2217
- [gr.Dropdown(visible=False, value=None),
2218
- gr.HTML(visible=False)],
2219
  outputs=[
2220
  csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file, report_task, html_report
2221
  ]
@@ -2234,11 +2485,23 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2234
  outputs=analyze_btn)
2235
 
2236
 
2237
- def create_csv_report_file(df, file_report, progress=gr.Progress(track_tqdm=True)):
2238
  try:
2239
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2240
- filename = f"/data/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
2241
- df.drop(labels=['Compound', 'Scaffold'], axis=1).to_csv(filename, index=True, na_rep='')
 
 
2242
 
2243
  return gr.File(filename)
2244
  except Exception as e:
@@ -2246,28 +2509,32 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
2246
  return None
2247
 
2248
 
2249
- def create_html_report_file(df, file_report, task, progress=gr.Progress(track_tqdm=True)):
2250
  try:
2251
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2252
- filename = f"/data/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html"
2253
- create_html_report(df, filename, task)
2254
  return gr.File(filename, visible=True)
2255
  except Exception as e:
2256
  gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
2257
  return None
2258
 
2259
 
2260
- html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
 
2261
  csv_generate.click(
2262
- lambda: [gr.Button(visible=False), gr.File(visible=True)], outputs=[csv_generate, csv_download_file],
2263
- ).then(fn=create_csv_report_file, inputs=[report_df, file_for_report],
2264
  outputs=csv_download_file, show_progress='full')
2265
  html_generate.click(
2266
- lambda: [gr.Button(visible=False), gr.File(visible=True)], outputs=[html_generate, html_download_file],
2267
- ).then(fn=create_html_report_file, inputs=[report_df, file_for_report, report_task],
2268
  outputs=html_download_file, show_progress='full')
2269
 
 
2270
  if __name__ == "__main__":
 
2271
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
2272
- pandarallel.initialize(progress_bar=True)
2273
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
 
 
 
27
  import requests
28
  from requests.adapters import HTTPAdapter, Retry
29
  from markdown import markdown
30
+ from rdkit import Chem, DataStructs
31
+ from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen, AllChem
32
  from rdkit.Chem.Scaffolds import MurckoScaffold
33
  import seaborn as sns
34
 
 
196
  'Nuclear Receptor': 'nuclear_receptor',
197
  'Ion Channel': 'ion_channel',
198
  'Others': 'others',
199
+ # 'general': 'general',
200
+ # 'kinase': 'kinase',
201
+ # 'non-kinase enzyme': 'non_kinase_enzyme',
202
+ # 'membrane receptor': 'membrane_receptor',
203
+ # 'nuclear Receptor': 'nuclear_receptor',
204
+ # 'ion channel': 'ion_channel',
205
+ # 'others': 'others',
206
  }
207
 
208
  TARGET_LIBRARY_MAP = {
 
254
  # Delete the job from the database
255
  db.remove(Job.id == job_id)
256
  # Delete the corresponding files
257
+ files = glob.glob(f"{SERVER_DATA_DIR}/{job_id}*")
258
  for file_path in files:
259
  if os.path.exists(file_path):
260
  os.remove(file_path)
 
272
  # Delete the job from the database
273
  db.remove(Job.id == job['id'])
274
  # Delete the corresponding file
275
+ files = glob.glob(f"{SERVER_DATA_DIR}/{job['id']}*")
276
  for file_path in files:
277
  if os.path.exists(file_path):
278
  os.remove(file_path)
 
285
  send_email(job)
286
 
287
 
288
+ @cache
289
+ def max_tanimoto_similarity(smi, seen_smiles):
290
+ if smi is None:
291
+ return 0
292
+ mol = Chem.MolFromSmiles(smi)
293
+ if mol is None:
294
+ return 0
295
+ mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
296
+ max_sim = 0
297
+ for smiles in seen_smiles:
298
+ mol_seen = Chem.MolFromSmiles(smiles)
299
+ mol_seen_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_seen, radius=2, nBits=2048)
300
+ sim = DataStructs.TanimotoSimilarity(mol_ecfp, mol_seen_ecfp)
301
+ if sim == 1:
302
+ return 1
303
+ max_sim = max(sim, max_sim)
304
+ return max_sim
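The max_tanimoto_similarity helper above compares radius-2, 2048-bit Morgan fingerprints with Tanimoto similarity and short-circuits on an exact match. A minimal standalone sketch of the same idea (hypothetical SMILES; it uses RDKit's bulk similarity call instead of the helper's explicit loop):

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

query = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")  # aspirin
seen = [Chem.MolFromSmiles(s) for s in ("c1ccccc1O", "CCO")]

# Radius-2 (ECFP4-like), 2048-bit Morgan fingerprints, as in the helper above
query_fp = AllChem.GetMorganFingerprintAsBitVect(query, radius=2, nBits=2048)
seen_fps = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2, nBits=2048) for m in seen]

# One Tanimoto similarity per reference fingerprint; keep the maximum
max_sim = max(DataStructs.BulkTanimotoSimilarity(query_fp, seen_fps))
print(round(max_sim, 3))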
305
+
306
+
307
+ @cache
308
+ def max_sequence_identity(seq, seen_fastas):
309
+ if seq is None:
310
+ return 0
311
+ aligner = PairwiseAligner()
312
+ aligner.mode = 'local'
313
+ max_id = 0
314
+ for fasta in seen_fastas:
315
+ alignment = aligner.align(seq, fasta)
316
+ identity = alignment.score / max(len(seq), len(fasta))
317
+ if identity == 1:
318
+ return 1
319
+ max_id = max(identity, max_id)
320
+ return max_id
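max_sequence_identity above approximates percent identity by dividing a local alignment score by the length of the longer sequence; with Biopython's PairwiseAligner defaults (match = 1, mismatch = 0, no gap penalties) that score is roughly the number of identically aligned residues. A minimal sketch with hypothetical sequences:

from Bio.Align import PairwiseAligner

aligner = PairwiseAligner()
aligner.mode = "local"

query = "MKTAYIAKQR"
reference = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"

# Local alignment score normalized by the longer sequence, as in the helper above
score = aligner.align(query, reference).score
identity = score / max(len(query), len(reference))
print(round(identity, 3))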
321
+
322
+
323
+ @cache
324
+ def get_seen_smiles(family, task):
325
+ seen_smiles = pd.read_csv(
326
+ f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
327
+ return seen_smiles['X1'].tolist()
328
+
329
+
330
+ @cache
331
+ def get_seen_fastas(family, task):
332
+ seen_fastas = pd.read_csv(
333
+ f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
334
+ return seen_fastas['X2'].tolist()
335
+
336
+
337
+ @cache
338
+ def get_fasta_family_map():
339
+ usecols = ['X2', 'ID2', 'Target Family']
340
+ fasta_family_map = pd.concat([
341
+ pd.read_csv('data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv', usecols=usecols),
342
+ pd.read_csv('data/target_libraries/idmapping_not_in_chembl.csv', usecols=usecols)
343
+ ]).drop_duplicates(subset=['X2'], keep='first')
344
+ return fasta_family_map
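These helpers are wrapped in @cache (presumably functools.cache), which hashes its arguments; that is why call sites later in this commit pass seen_smiles and seen_fastas as tuple(...) rather than lists. A small illustrative sketch (count_unique and smiles_list are hypothetical names):

from functools import cache

@cache
def count_unique(items: tuple) -> int:
    return len(set(items))

smiles_list = ["CCO", "c1ccccc1", "CCO"]
print(count_unique(tuple(smiles_list)))  # lists are unhashable, so convert first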
345
 
346
 
347
  def lipinski(mol):
 
697
  raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
698
 
699
 
700
+ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
701
  job_id = state['id']
702
  status = "RUNNING"
703
  error = None
704
  task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
705
  predictions_file = None
706
+
707
+ df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
708
+ orig_df = pd.read_csv(predict_filepath)
709
+ alignment_df = get_fasta_family_map()
710
+ prediction_df = pd.DataFrame()
711
+
712
+ @cache
713
+ def detect_family(query):
714
+ # Check for an exact match first
715
+ exact_match = alignment_df[alignment_df['X2'] == query]
716
+ if not exact_match.empty:
717
+ row = exact_match.iloc[0]
718
+ return row['Target Family']
719
+ # If no exact match, then calculate alignment score
720
+ else:
721
+ aligner = PairwiseAligner(mode='local')
722
+
723
+ def align_score(target):
724
+ alignment = aligner.align(query, target)
725
+ return alignment.score / max(len(query), len(target))
726
+
727
+ alignment_df['score'] = alignment_df['X2'].apply(align_score)
728
+ row = alignment_df.loc[alignment_df['score'].idxmax()]
729
+ return row['Target Family']
730
+
731
+ if 'Target Family' not in orig_df.columns:
732
+ orig_df['Target Family'] = None
733
+ orig_df.loc[
734
+ orig_df['Target Family'].isna(), 'Target Family'
735
+ ] = orig_df.loc[
736
+ orig_df['Target Family'].isna(), 'X2'
737
+ ].parallel_apply(detect_family)
738
+
739
+ detect_family.cache_clear()
740
+
741
+ orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
742
+ annotated_df = orig_df[~orig_df['Y'].isna()].copy()
743
+ annotated_df.rename(columns={'Y': 'Y^'}, inplace=True)
744
+ annotated_df['Prediction Source'] = 'Training Data'
745
+ # Resave the unannotated data
746
+ unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y', 'Target Family'], axis=1)
747
+ if not unannotated_df.empty:
748
+ unannotated_df.to_csv(predict_filepath, index=False, na_rep='')
749
+ else:
750
+ annotated_df.to_csv(predictions_file, index=False, na_rep='')
751
+ status = "COMPLETED"
752
+ return {run_state: False}
753
+
754
+ columns_to_drop = ['ID1', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^']
755
+ columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns]
756
+ orig_df.drop(columns_to_drop, axis=1, inplace=True)
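The block above splits the input into pairs already labelled in the training data (served back directly as "Training Data" predictions) and pairs that still need model inference. A minimal pandas sketch of that split, with hypothetical rows:

import pandas as pd

queries = pd.DataFrame({"X1": ["CCO", "CCN"], "X2": ["MKTAYIAKQR", "MKTAYIAKQR"]})
training = pd.DataFrame({"X1": ["CCO"], "X2": ["MKTAYIAKQR"], "Y": [0.92]})

# Left merge on (X1, X2) pulls in known labels where they exist
merged = queries.merge(training, on=["X1", "X2"], how="left")
annotated = merged[merged["Y"].notna()].rename(columns={"Y": "Y^"})
unannotated = merged[merged["Y"].isna()].drop(columns=["Y"])

print(annotated)    # answered from training data
print(unannotated)  # written back out as the model's input file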
757
+
758
  try:
759
+ if target_family != 'Family-Specific Auto-Recommendation':
760
+ target_family_value = TARGET_FAMILY_MAP[target_family.title()]
761
+ task_value = TASK_MAP[task]
762
+ preset_value = PRESET_MAP[preset]
763
+ predictions_file = (f'{SERVER_DATA_DIR}/'
764
+ f'{job_id}_{task_file_abbr[task]}_{preset_value}_{target_family_value}_predictions.csv')
765
+
766
+ cfg = hydra.compose(
767
+ config_name="webserver_inference",
768
+ overrides=[f"task={task_value}",
769
+ f"preset={preset_value}",
770
+ f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt",
771
+ f"data.data_file='{str(predict_filepath)}'"])
772
+
773
+ predictions, _ = predict(cfg)
774
+ predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
775
+ predictions['Prediction Source'] = f'{preset} ({target_family})'
776
+ prediction_df = pd.concat([prediction_df, predictions])
777
 
778
+ else:
779
+ predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_auto_predictions.csv'
780
+ task_value = TASK_MAP[task]
781
+ score = TASK_METRIC_MAP[task]
782
+ benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
783
+ predict_df = pd.read_csv(predict_filepath)
784
+
785
+ for family, subset in predict_df.groupby('Target Family'):
786
+ predict_subset_filepath = f'{SERVER_DATA_DIR}/{job_id}_{family}_input.csv'
787
+ subset.to_csv(predict_subset_filepath, index=False, na_rep='')
788
+ seen_compounds = get_seen_smiles(family, task_value)
789
+
790
+ if subset['X1'].iloc[0] in seen_compounds:
791
+ scenario = "Seen Compound"
792
+ else:
793
+ scenario = "Unseen Compound"
794
+
795
+ filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
796
+ & (benchmark_df['Scenario'] == scenario)]
797
+
798
+ preset = filtered_df.loc[filtered_df[score].idxmax(), 'preset']
799
+ preset_value = PRESET_MAP[preset]
800
+
801
+ target_family = TARGET_FAMILY_MAP[family.title()]
802
+ cfg = hydra.compose(
803
+ config_name="webserver_inference",
804
+ overrides=[f"task={task_value}",
805
+ f"preset={preset_value}",
806
+ f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
807
+ f"data.data_file='{str(predict_subset_filepath)}'"])
808
+
809
+ predictions, _ = predict(cfg)
810
+ predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
811
+ predictions['Prediction Source'] = f'{preset} ({family})'
812
+ prediction_df = pd.concat([prediction_df, predictions])
813
+
814
+ prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
815
+ prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
816
+
817
+ # prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
818
+ # lambda group: group.parallel_apply(
819
+ # max_tanimoto_similarity,
820
+ # seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
821
+ # )
822
+ # ).values
823
+ #
824
+ # prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
825
+ # lambda group: group.parallel_apply(
826
+ # max_sequence_identity,
827
+ # seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
828
+ # )
829
+ # ).values
830
+ if "Include Max. Tanimoto Similarity" in opts:
831
+ for family in prediction_df['Target Family'].unique():
832
+ prediction_df.loc[
833
+ prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = prediction_df.loc[
834
+ prediction_df['Target Family'] == family, 'X1'].parallel_apply(
835
+ max_tanimoto_similarity,
836
+ seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
837
+ )
838
 
839
+ if "Include Max. Sequence Identity" in opts:
840
+ for family in prediction_df['Target Family'].unique():
841
+ prediction_df.loc[
842
+ prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = prediction_df.loc[
843
+ prediction_df['Target Family'] == family, 'X2'].parallel_apply(
844
+ max_sequence_identity,
845
+ seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
846
+ )
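The two optional columns above are filled family by family: rows are masked with .loc on 'Target Family' so each group is compared only against its own family's reference set. A small sketch of that assignment pattern (hypothetical data and a stand-in metric in place of max_tanimoto_similarity):

import pandas as pd

df = pd.DataFrame({"Target Family": ["Kinase", "Kinase", "Ion Channel"],
                   "X1": ["CCO", "CCN", "c1ccccc1"]})
reference = {"Kinase": ("CCO",), "Ion Channel": ("CCCC",)}

for family in df["Target Family"].unique():
    mask = df["Target Family"] == family
    # Stand-in metric: 1.0 if the compound is in the family's reference set
    df.loc[mask, "Max. Tanimoto Similarity"] = df.loc[mask, "X1"].apply(
        lambda smi: 1.0 if smi in reference[family] else 0.0)
print(df)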
847
 
848
+ prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
849
  status = "COMPLETED"
850
 
851
  return {run_state: False}
 
885
  task = 'Compound-Protein Binding Affinity'
886
 
887
  df = pd.read_csv(file)
888
+
889
  if 'N' in df.columns:
890
  df.set_index('N', inplace=True)
891
+
892
  if not any(col in ['X1', 'X2'] for col in df.columns):
893
  gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
894
  return {analyze_btn: gr.Button(interactive=False)}
895
+
896
  if 'X1' in df.columns:
 
897
  if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
898
  df['Compound'] = df['X1'].parallel_apply(
899
  lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
900
+ df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
901
+ df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
902
+
903
 
904
  # DF_FOR_REPORT = df.copy()
905
 
 
925
  return {analyze_btn: gr.Button(interactive=False)}
926
 
927
 
928
+ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)):
929
  df_html = df.copy(deep=True)
930
  column_aliases = COLUMN_ALIASES.copy()
931
  cols_left = list(pd.Index(
 
936
  if isinstance(task, str):
937
  column_aliases.update({
938
  'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
939
+ else 'Actual Binding Affinity pIC50 [nM]',
940
  'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
941
+ else 'Predicted Binding Affinity (pIC50 [nM])'
942
  })
943
 
944
  ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
 
976
 
977
  elif 'Y^' in df_html.columns:
978
  job = 'Interaction Pair Inference'
979
+ if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
980
  df_html['Compound'] = df_html['Compound'].parallel_apply(
981
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
982
+ else:
983
+ df_html.drop(['Compound'], axis=1, inplace=True)
984
+
985
+ if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
986
  df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
987
  lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
988
+ else:
989
+ df_html.drop(['Scaffold'], axis=1, inplace=True)
990
 
991
  df_html.rename(columns=column_aliases, inplace=True)
992
  df_html.index.name = 'Index'
 
1454
  "Interaction prediction provides you binding probability score between the target of "
1455
  "interest and each compound in the library, "
1456
  "while affinity prediction directly estimates their binding strength measured using "
1457
+ "pIC<sub>50</sub> in units of nM."
1458
  )
1459
  drug_screen_task = gr.Dropdown(
1460
  list(TASK_MAP.keys()),
 
1491
  drug_library_upload_btn = gr.UploadButton(
1492
  label='OR Upload Your Own Library', variant='primary')
1493
  drug_library_upload = gr.File(label='Custom compound library file', visible=False)
1494
+ with gr.Column():
1495
+ drug_screen_opts = gr.CheckboxGroup(
1496
+ ['Include Max. Tanimoto Similarity'],
1497
+ label='Step 6. Select Additional Options',
1498
+ info="Calculating the maximum Tanimoto similarity of the library compounds to the "
1499
+ "training dataset is an experimental feature and may take a considerable amount of time."
1500
+ )
1501
  with gr.Row():
1502
  with gr.Column():
1503
  drug_screen_email = gr.Textbox(
1504
+ label='Step 7. Input Your Email Address (Optional)',
1505
  info="Your email address will be used to notify you of the status of your job. "
1506
  "If you cannot receive the email, please check your spam/junk folder."
1507
  )
1508
 
1509
  with gr.Row(visible=True):
1510
  with gr.Column():
1511
+ drug_screen_clr_btn = gr.ClearButton(size='lg')
1512
  drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
1513
  # TODO Modify the pd df directly with df['X2'] = target
1514
 
 
1544
  example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
1545
 
1546
  with gr.Row():
1547
+ with gr.Column(visible=True):
1548
  HelpTip(
1549
  "By default, models trained on all protein families (general) will be applied. "
1550
+ "If you upload a target library containing proteins all in the same family, "
1551
+ "you may manually select a Target Family."
1552
  )
1553
  target_identify_target_family = gr.Dropdown(
1554
+ choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()),
1555
+ value='General',
1556
+ label='Step 2. Select Target Family')
 
1557
  with gr.Column():
1558
  HelpTip(
1559
  "Interaction prediction provides you binding probability score between the target of "
1560
  "interest and each compound in the library, while affinity prediction directly "
1561
+ "estimates their binding strength measured using pIC<sub>50</sub> in units of nM."
1562
  )
1563
  target_identify_task = gr.Dropdown(
1564
  list(TASK_MAP.keys()),
1565
+ label='Step 3. Select a Prediction Task',
1566
  value='Compound-Protein Interaction')
1567
 
1568
  with gr.Column():
 
1573
  "Please refer to the documentation for detailed benchmark results."
1574
  )
1575
  target_identify_preset = gr.Dropdown(
1576
+ ['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()),
1577
+ label='Step 4. Select a Preset Model')
1578
  identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
1579
  variant='primary')
1580
  with gr.Row():
 
1587
  "and can be downloaded by clicking the lower right corner."
1588
  )
1589
  target_library = gr.Dropdown(
1590
+ label='Step 5. Select a Preset Target Library',
1591
  choices=list(TARGET_LIBRARY_MAP.keys()))
1592
  with gr.Row():
1593
  gr.File(label='Example FASTA target library',
 
1598
  label='OR Upload Your Own Library', variant='primary')
1599
  target_library_upload = gr.File(label='Custom target library file', visible=False)
1600
 
1601
+ with gr.Column():
1602
+ target_identify_opts = gr.CheckboxGroup(
1603
+ ['Include Max. Sequence Identity'],
1604
+ label='Step 6. Select Additional Options',
1605
+ info="Calculating the maximum sequence identity of the library protein to the "
1606
+ "training dataset is an experimental feature and may take a considerable amount of time."
1607
+ )
1608
  with gr.Row():
1609
  with gr.Column():
1610
  target_identify_email = gr.Textbox(
1611
+ label='Step 7. Input Your Email Address (Optional)',
1612
  info="Your email address will be used to notify you of the status of your job. "
1613
  "If you cannot receive the email, please check your spam/junk folder."
1614
  )
1615
 
1616
  with gr.Row(visible=True):
1617
+ target_identify_clr_btn = gr.ClearButton(size='lg')
1618
  target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
1619
  size='lg')
1620
 
 
1692
  "Interaction prediction provides you binding probability score "
1693
  "between the target of interest and each compound in the library, "
1694
  "while affinity prediction directly estimates their binding strength "
1695
+ "measured using pIC<sub>50</sub> in units of nM."
1696
  )
1697
  pair_infer_task = gr.Dropdown(
1698
  list(TASK_MAP.keys()),
 
1716
  "If you cannot receive the email, please check your spam/junk folder.")
1717
 
1718
  with gr.Row(visible=True):
1719
+ pair_infer_clr_btn = gr.ClearButton(size='lg')
1720
  pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
1721
 
1722
  infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
 
1737
  Please first `Preview` the report, then `Generate` and download a CSV report
1738
  or an interactive HTML report below if you wish to access the full report.
1739
  ''')
1740
+ raw_df = gr.State(value=pd.DataFrame())
1741
+ report_df = gr.State(value=pd.DataFrame())
1742
  with gr.Row():
1743
+ with gr.Column(scale=1):
1744
  file_for_report = gr.File(interactive=True, type='filepath')
1745
  report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
1746
+ label='Specify the Task Labels in the Uploaded Dataset')
1747
+ with gr.Column(scale=2):
1748
+ with gr.Row():
1749
+ scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
1750
+ filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
1751
+ with gr.Accordion('Report Generate Options', open=False):
1752
+ with gr.Row():
1753
+ csv_sep = gr.Radio(label='CSV Delimiter',
1754
+ choices=['Comma', 'Tab'], value='Comma')
1755
+ html_opts = gr.CheckboxGroup(label='HTML Report Options',
1756
+ choices=['Exclude Molecular Graph', 'Exclude Scaffold Graph'])
1757
 
1758
  with gr.Row():
1759
+ report_clr_btn = gr.ClearButton(size='lg')
1760
+ analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary',
1761
+ size='lg', interactive=False)
1762
 
1763
  with gr.Row():
1764
  with gr.Column(scale=3):
1765
  html_report = gr.HTML() # label='Results', visible=True)
1766
+ ranking_pie_chart = gr.Plot(visible=False)
1767
 
1768
  with gr.Row():
1769
  with gr.Column():
 
1783
  if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
1784
 
1785
  You will be redirected to Chemical Property Report for carrying out further analysis and
1786
+ generating the full report when the job is done. If the Lookup fails to respond, please wait for a
1787
+ few minutes and refresh the page to try again.
1788
  ''')
1789
  with gr.Column():
1790
  pred_lookup_id = gr.Textbox(
 
1888
 
1889
  def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
1890
  try:
1891
+ aligner = PairwiseAligner(mode='local')
1892
+ alignment_df = get_fasta_family_map()
1893
 
1894
  processed_fasta = process_target_fasta(fasta)
1895
 
 
1897
  exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
1898
  if not exact_match.empty:
1899
  row = exact_match.iloc[0]
1900
+ return gr.Dropdown(
1901
+ value=row['Target Family'],
1902
+ info=f"Reason: Exact match found with {row['ID2']} from family {row['Target Family']}")
1903
 
1904
  # If no exact match, then calculate alignment score
1905
  def align_score(query):
1906
+ alignment = aligner.align(processed_fasta, query)
1907
+ return alignment.score / max(len(processed_fasta), len(query))
1908
 
1909
  alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
1910
  row = alignment_df.loc[alignment_df['score'].idxmax()]
1911
+ return gr.Dropdown(value=row['Target Family'],
1912
+ info=f"Reason: Best sequence identity ({row['score']}) "
1913
+ f"with {row['ID2']} from family {row['Target Family']}")
1914
  except Exception as e:
1915
  gr.Warning("Failed to detect the protein family due to error: " + str(e))
1916
 
 
1973
  scenario_general = "Unseen Target"
1974
 
1975
  seen_targets_family = pd.read_csv(
1976
+ f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
1977
  if process_target_fasta(fasta) in seen_targets_family['X2'].values:
1978
  scenario_family = "Seen Target"
1979
  else:
 
1988
  filtered_df = pd.concat([filtered_df_general, filtered_df_family])
1989
 
1990
  row = filtered_df.loc[filtered_df[score].idxmax()]
1991
+ if row['Scenario'] == 'Seen Target':
1992
+ scenario = "Seen Target (>=0.85 sequence identity)"
1993
+ elif row['Scenario'] == 'Unseen Target':
1994
+ scenario = "Unseen Target (<0.85 sequence identity)"
1995
 
1996
  return {drug_screen_preset:
1997
  gr.Dropdown(value=row['Model'],
1998
  info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
1999
+ f"model with the best {score} in the {scenario} scenario "
2000
+ f"on {row['Family']}."),
2001
  drug_screen_target_family:
2002
  gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
2003
 
 
2053
  gr.Warning('Please enter a valid SMILES for model recommendation.')
2054
  return None
2055
 
2056
+ seen_compounds = pd.read_csv(
2057
+ f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv')
2058
+ if rdkit_canonicalize(smiles) in seen_compounds['X1'].values:
2059
  scenario = "Seen Compound"
2060
  else:
2061
  scenario = "Unseen Compound"
 
2068
 
2069
  return gr.Dropdown(value=row['Model'],
2070
  info=f"Reason: {scenario} in training; choosing the model "
2071
+ f"with the best {score} in the {scenario} scenario.")
 
2072
 
2073
 
2074
  identify_preset_recommend_btn.click(fn=identify_recommend_model,
 
2169
 
2170
  job_id = str(uuid4())
2171
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2172
+ screen_df.to_csv(temp_file, index=False, na_rep='')
2173
  if temp_file.is_file():
2174
  job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
2175
  return {screen_data_for_predict: str(temp_file),
 
2199
 
2200
  job_id = str(uuid4())
2201
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2202
+ identify_df.to_csv(temp_file, index=False, na_rep='')
2203
  if temp_file.is_file():
2204
  job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
2205
  return {identify_data_for_predict: str(temp_file),
 
2247
  f'than the allowed maximum {DATASET_MAX_LEN}.')
2248
 
2249
  temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
2250
+ infer_df.to_csv(temp_file, index=False, na_rep='')
2251
 
2252
  else:
2253
  raise gr.Error('Should upload a compound-protein pair dataset, or '
 
2297
  drug_screen_click.success(
2298
  fn=submit_predict,
2299
  inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
2300
+ drug_screen_target_family, drug_screen_opts, run_state, ],
2301
  outputs=[run_state, ]
2302
  )
2303
 
2304
+ drug_screen_clr_btn.click(
2305
+ lambda: ['General'] + [None] * 5,
2306
+ outputs=[drug_screen_target_family,
2307
+ target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email])
2308
+
2309
+ target_identify_clr_btn.click(
2310
+ lambda: ['General'] + [None] * 5,
2311
+ outputs=[target_identify_target_family,
2312
+ compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email])
2313
+
2314
+ pair_infer_clr_btn.click(
2315
+ lambda: ['General'] + [None] * 4,
2316
+ outputs=[pair_infer_target_family,
2317
+ infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_email])
2318
+
2319
+ report_clr_btn.click(
2320
+ lambda: ['General'] + [None] * 4,
2321
+ outputs=[scores,
2322
+ target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email])
2323
+
2324
+
2325
+ def update_preset(family, preset):
2326
+ if family == 'Family-Specific Auto-Recommendation':
2327
+ return 'Family-Specific Auto-Recommendation'
2328
+ elif preset == 'Family-Specific Auto-Recommendation':
2329
+ return None
2330
+ else:
2331
+ return preset
2332
+
2333
+ def update_family(family, preset):
2334
+ if preset == 'Family-Specific Auto-Recommendation':
2335
+ return 'Family-Specific Auto-Recommendation'
2336
+ elif family == 'Family-Specific Auto-Recommendation':
2337
+ return None
2338
+ else:
2339
+ return family
2340
+
2341
+ target_identify_target_family.change(
2342
+ fn=update_preset, inputs=[target_identify_target_family, target_identify_preset],
2343
+ outputs=target_identify_preset, show_progress='hidden')
2344
+ target_identify_preset.change(
2345
+ fn=update_family, inputs=[target_identify_target_family, target_identify_preset],
2346
+ outputs=target_identify_target_family, show_progress='hidden')
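update_preset and update_family keep the Target Family and Preset dropdowns mutually consistent: picking 'Family-Specific Auto-Recommendation' in one forces it in the other, and switching away clears the counterpart. A stripped-down, standalone Gradio sketch of the same wiring (component names and choices here are hypothetical):

import gradio as gr

AUTO = "Family-Specific Auto-Recommendation"

def sync_other(this_value, other_value):
    if this_value == AUTO:
        return AUTO
    if other_value == AUTO:
        return None
    return other_value

with gr.Blocks() as sync_demo:
    family = gr.Dropdown([AUTO, "Kinase", "General"], label="Target Family")
    preset = gr.Dropdown([AUTO, "Preset A", "Preset B"], label="Preset Model")
    # Each change hands (changed value, counterpart value) to the same handler
    family.change(sync_other, inputs=[family, preset], outputs=preset, show_progress="hidden")
    preset.change(sync_other, inputs=[preset, family], outputs=family, show_progress="hidden")

# sync_demo.launch()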
2347
+
2348
  target_identify_click = target_identify_btn.click(
2349
  fn=target_identify_validate,
2350
  inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
 
2373
  target_identify_click.success(
2374
  fn=submit_predict,
2375
  inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
2376
+ target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email],
2377
  outputs=[run_state, ]
2378
  )
2379
 
 
2448
  report_df_change = file_for_report.change(
2449
  fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
2450
  concurrency_limit=100,
2451
+ ).then(
2452
+ fn=lambda: [gr.Button(interactive=True)] * 2,
2453
+ outputs=[csv_generate, html_generate],
2454
  )
2455
 
2456
  file_for_report.upload(
 
2465
  file_for_report.clear(
2466
  fn=lambda: [gr.Button(interactive=False)] * 3 +
2467
  [gr.File(visible=False, value=None)] * 2 +
2468
+ [gr.Dropdown(visible=False, value=None), gr.HTML(visible=False)],
2469
+ cancels=[report_df_change],
2470
  outputs=[
2471
  csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file, report_task, html_report
2472
  ]
 
2485
  outputs=analyze_btn)
2486
 
2487
 
2488
+ def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
2489
+ csv_sep_map = {
2490
+ 'Comma': ',',
2491
+ 'Tab': '\t',
2492
+ }
2493
+ Y_colname = 'Y^'
2494
+ if isinstance(task, str):
2495
+ if task == 'Compound-Protein Interaction':
2496
+ Y_colname = 'Y^_pIC50'
2497
+ elif task == 'Compound-Protein Binding Affinity':
2498
+ Y_colname = 'Y^_prob'
2499
  try:
2500
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2501
+ filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
2502
+ df.rename(columns={'Y^': Y_colname}).drop(
2503
+ labels=['Compound', 'Scaffold'], axis=1
2504
+ ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep])
2505
 
2506
  return gr.File(filename)
2507
  except Exception as e:
 
2509
  return None
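For the CSV path above, the delimiter radio maps straight onto pandas' sep argument, and the RDKit image columns are dropped before export. A compact sketch with a hypothetical frame and output path:

import pandas as pd

csv_sep_map = {"Comma": ",", "Tab": "\t"}
df = pd.DataFrame({"X1": ["CCO"], "Y^": [0.87], "Compound": [None], "Scaffold": [None]})

out = (df.rename(columns={"Y^": "Y^_prob"})        # task-dependent column name
         .drop(labels=["Compound", "Scaffold"], axis=1))
out.to_csv("report.csv", index=False, na_rep="", sep=csv_sep_map["Comma"])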
2510
 
2511
 
2512
+ def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)):
2513
  try:
2514
  now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
2515
+ filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.html"
2516
+ create_html_report(df, filename, task, opts)
2517
  return gr.File(filename, visible=True)
2518
  except Exception as e:
2519
  gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
2520
  return None
2521
 
2522
 
2523
+ # html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
2524
+
2525
  csv_generate.click(
2526
+ lambda: [gr.File(visible=True)], outputs=[csv_download_file],
2527
+ ).then(fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep],
2528
  outputs=csv_download_file, show_progress='full')
2529
  html_generate.click(
2530
+ lambda: [gr.File(visible=True)], outputs=[html_download_file],
2531
+ ).then(fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts],
2532
  outputs=html_download_file, show_progress='full')
2533
 
2534
+
2535
  if __name__ == "__main__":
2536
+ pandarallel.initialize()
2537
  hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
 
2538
  demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
2539
+ scheduler.add_job(check_expiry, 'interval', hours=1)
2540
+ scheduler.start()
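The cleanup job here is registered after demo.queue(...).launch(...), which blocks by default, so in practice a background scheduler is usually started before the blocking call (or launch is made non-blocking). A minimal sketch assuming APScheduler's BackgroundScheduler and the check_expiry/demo objects defined above:

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.add_job(check_expiry, "interval", hours=1)
scheduler.start()  # runs in a daemon thread alongside the web server

demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
scheduler.shutdown()  # reached once the Gradio server exits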