Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
app.py
CHANGED
@@ -278,36 +278,70 @@ def check_expiry():
|
|
278 |
|
279 |
|
280 |
def max_tanimoto_similarity(smi, seen_smiles_with_fp):
|
281 |
-
if smi is None:
|
282 |
-
return 0
|
|
|
283 |
if smi in seen_smiles_with_fp['X1'].values:
|
284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
mol = Chem.MolFromSmiles(smi)
|
286 |
if mol is None:
|
287 |
-
return 0
|
|
|
288 |
mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
|
289 |
sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
|
290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
|
292 |
|
293 |
def max_sequence_identity(seq, seen_fastas):
|
294 |
-
if seq is None:
|
295 |
-
return 0
|
296 |
-
|
297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
aligner = PairwiseAligner()
|
299 |
aligner.mode = 'local'
|
300 |
-
|
301 |
-
|
|
|
302 |
alignment = aligner.align(seq, fasta)
|
303 |
identity = alignment.score / max(len(seq), len(fasta))
|
304 |
if identity == 1:
|
305 |
-
|
306 |
-
|
307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
|
309 |
|
310 |
-
@cache
|
311 |
def get_seen_smiles(family, task):
|
312 |
if family == 'General':
|
313 |
family = 'all_families_full'
|
@@ -318,7 +352,6 @@ def get_seen_smiles(family, task):
|
|
318 |
return seen_smiles
|
319 |
|
320 |
|
321 |
-
@cache
|
322 |
def get_seen_fastas(family, task):
|
323 |
if family == 'General':
|
324 |
family = 'all_families_full'
|
@@ -717,7 +750,8 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
717 |
return row['Target Family']
|
718 |
# If no exact match, then calculate alignment score
|
719 |
else:
|
720 |
-
aligner = PairwiseAligner(
|
|
|
721 |
|
722 |
def align_score(target):
|
723 |
alignment = aligner.align(query, target)
|
@@ -827,24 +861,62 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
827 |
prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
|
828 |
prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
|
829 |
|
830 |
-
if "Include Max. Tanimoto Similarity" in opts:
|
831 |
for family in prediction_df['Target Family'].unique():
|
832 |
family_smiles_df = get_seen_smiles(family=family, task=task_value)
|
833 |
family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
|
834 |
lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
|
835 |
Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
|
836 |
)
|
837 |
-
|
838 |
-
|
|
|
|
|
|
|
|
|
|
|
839 |
prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
|
840 |
)
|
841 |
max_sim.cache_clear()
|
842 |
|
843 |
-
if "Include Max.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
844 |
for family in prediction_df['Target Family'].unique():
|
845 |
family_fastas_df = get_seen_fastas(family=family, task=task_value)
|
846 |
-
|
847 |
-
|
|
|
|
|
|
|
|
|
|
|
848 |
prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
|
849 |
)
|
850 |
max_id.cache_clear()
|
@@ -1499,10 +1571,18 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1499 |
drug_library_upload = gr.File(label='Custom compound library file', visible=False)
|
1500 |
|
1501 |
drug_screen_opts = gr.CheckboxGroup(
|
1502 |
-
|
1503 |
-
|
1504 |
-
|
1505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1506 |
)
|
1507 |
with gr.Row():
|
1508 |
with gr.Column():
|
@@ -1516,7 +1596,6 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1516 |
with gr.Row():
|
1517 |
drug_screen_clr_btn = gr.ClearButton(size='lg')
|
1518 |
drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
|
1519 |
-
# TODO Modify the pd df directly with df['X2'] = target
|
1520 |
|
1521 |
screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
|
1522 |
|
@@ -1606,10 +1685,11 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1606 |
target_library_upload = gr.File(label='Custom target library file', visible=False)
|
1607 |
|
1608 |
target_identify_opts = gr.CheckboxGroup(
|
1609 |
-
['Include Max. Sequence Identity'],
|
1610 |
label='Step 6. Select Additional Options',
|
1611 |
-
info="
|
1612 |
-
"
|
|
|
1613 |
)
|
1614 |
with gr.Row():
|
1615 |
with gr.Column():
|
@@ -1934,6 +2014,14 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1934 |
x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
|
1935 |
], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
|
1936 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1937 |
|
1938 |
def example_fill(input_type):
|
1939 |
return {target_id: 'Q16539',
|
@@ -2558,14 +2646,14 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2558 |
|
2559 |
if __name__ == "__main__":
|
2560 |
pandarallel.initialize()
|
2561 |
-
|
2562 |
hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
|
2563 |
-
|
2564 |
session = requests.Session()
|
2565 |
ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
|
2566 |
session.mount('http://', ADAPTER)
|
2567 |
session.mount('https://', ADAPTER)
|
2568 |
-
|
2569 |
db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
|
2570 |
# Set all RUNNING jobs to FAILED at TinyDB initialization
|
2571 |
Job = Query()
|
@@ -2573,9 +2661,9 @@ if __name__ == "__main__":
|
|
2573 |
for job in jobs:
|
2574 |
if job['status'] == 'RUNNING':
|
2575 |
db.update({'status': 'FAILED'}, Job.id == job['id'])
|
2576 |
-
|
2577 |
scheduler = BackgroundScheduler()
|
2578 |
scheduler.add_job(check_expiry, 'interval', hours=1)
|
2579 |
scheduler.start()
|
2580 |
-
|
2581 |
demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
|
|
|
278 |
|
279 |
|
280 |
def _compound_label(identifier, fallback):
    """Return ``identifier`` unless it is null or an empty string, else ``fallback``."""
    if pd.notnull(identifier) and identifier != '':
        return identifier
    return fallback


def max_tanimoto_similarity(smi, seen_smiles_with_fp):
    """Find the most similar compound to ``smi`` in ``seen_smiles_with_fp``.

    Args:
        smi: Query SMILES string, or ``None``.
        seen_smiles_with_fp: DataFrame with an ``X1`` column of SMILES and an
            ``FP`` column that is handed to ``BulkTanimotoSimilarity``
            (presumably precomputed fingerprints comparable to the radius-2,
            2048-bit Morgan fingerprint built here -- verify against caller).
            An optional ``ID1`` column supplies compound identifiers.

    Returns:
        A dict with two keys:
        ``'Max. Tanimoto Similarity'`` -- 0 when the query or reference set is
        missing/empty or the SMILES cannot be parsed, 1 on an exact string
        match in ``X1``, otherwise the highest similarity found;
        ``'Max. Tanimoto Similarity Compound'`` -- the matching row's ``ID1``
        when that column exists and the entry is non-null and non-empty,
        otherwise its ``X1`` value; ``None`` when the similarity is the
        fallback 0.
    """
    if smi is None or seen_smiles_with_fp is None or seen_smiles_with_fp.empty:
        return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}

    has_ids = 'ID1' in seen_smiles_with_fp.columns

    # Exact string match: skip fingerprinting entirely.
    if smi in seen_smiles_with_fp['X1'].values:
        compound = smi
        if has_ids:
            first_id = seen_smiles_with_fp.loc[seen_smiles_with_fp['X1'] == smi, 'ID1'].values[0]
            compound = _compound_label(first_id, smi)
        return {'Max. Tanimoto Similarity': 1, 'Max. Tanimoto Similarity Compound': compound}

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return {'Max. Tanimoto Similarity': 0, 'Max. Tanimoto Similarity Compound': None}

    mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    # Pass the fingerprints as a plain positional list so that result index i
    # corresponds to row position i (used with .iloc below) regardless of the
    # DataFrame's index labels.
    sims = list(BulkTanimotoSimilarity(mol_ecfp, list(seen_smiles_with_fp['FP'])))
    # The similarity result is a plain sequence, which has no .argmax();
    # pick the position of the first maximum instead.
    idx = max(range(len(sims)), key=sims.__getitem__)

    best_row = seen_smiles_with_fp.iloc[idx]
    compound = best_row['X1']
    if has_ids:
        compound = _compound_label(best_row['ID1'], compound)

    return {'Max. Tanimoto Similarity': sims[idx], 'Max. Tanimoto Similarity Compound': compound}
|
306 |
|
307 |
|
308 |
def _target_label(seen_fastas, fasta):
    """Return the ``ID2`` of the first row whose ``X2`` equals ``fasta``.

    Falls back to ``fasta`` itself when there is no ``ID2`` column or the
    entry is null or an empty string.
    """
    if 'ID2' in seen_fastas.columns:
        id2 = seen_fastas.loc[seen_fastas['X2'] == fasta, 'ID2'].values[0]
        if pd.notnull(id2) and id2 != '':
            return id2
    return fasta


def max_sequence_identity(seq, seen_fastas):
    """Find the sequence in ``seen_fastas`` most similar to ``seq``.

    Identity is the local-alignment score divided by the length of the longer
    of the two sequences.

    Args:
        seq: Query sequence string, or ``None``.
        seen_fastas: DataFrame with an ``X2`` column of sequences and an
            optional ``ID2`` column of target identifiers.

    Returns:
        A dict with two keys:
        ``'Max. Sequence Identity'`` -- 0 when the query or reference set is
        missing/empty or no alignment scores above 0, 1 on an exact string
        match or a perfect alignment, otherwise the highest identity found;
        ``'Max. Sequence Identity Target'`` -- the best row's ``ID2`` when
        that column exists and the entry is non-null and non-empty, otherwise
        its ``X2`` sequence; ``None`` when nothing scored above 0.
    """
    if seq is None or seen_fastas is None or seen_fastas.empty:
        return {'Max. Sequence Identity': 0, 'Max. Sequence Identity Target': None}

    sequences = seen_fastas['X2'].values

    # Exact string match: no alignment needed.
    if seq in sequences:
        return {'Max. Sequence Identity': 1,
                'Max. Sequence Identity Target': _target_label(seen_fastas, seq)}

    aligner = PairwiseAligner()
    aligner.mode = 'local'

    max_iden = 0
    best_fasta = None
    for fasta in sequences:
        # Only the score is needed, so ask the aligner for the score directly
        # instead of building an alignments object.
        identity = aligner.score(seq, fasta) / max(len(seq), len(fasta))
        if identity == 1:
            # Cannot be beaten; stop scanning.
            return {'Max. Sequence Identity': 1,
                    'Max. Sequence Identity Target': _target_label(seen_fastas, fasta)}
        if identity > max_iden:
            max_iden = identity
            best_fasta = fasta

    # Resolve the display label once, for the final winner only, rather than
    # rescanning the ID column every time the running maximum improves.
    target = None if best_fasta is None else _target_label(seen_fastas, best_fasta)
    return {'Max. Sequence Identity': max_iden, 'Max. Sequence Identity Target': target}
|
343 |
|
344 |
|
|
|
345 |
def get_seen_smiles(family, task):
|
346 |
if family == 'General':
|
347 |
family = 'all_families_full'
|
|
|
352 |
return seen_smiles
|
353 |
|
354 |
|
|
|
355 |
def get_seen_fastas(family, task):
|
356 |
if family == 'General':
|
357 |
family = 'all_families_full'
|
|
|
750 |
return row['Target Family']
|
751 |
# If no exact match, then calculate alignment score
|
752 |
else:
|
753 |
+
aligner = PairwiseAligner()
|
754 |
+
aligner.mode = 'local'
|
755 |
|
756 |
def align_score(target):
|
757 |
alignment = aligner.align(query, target)
|
|
|
861 |
prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
|
862 |
prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
|
863 |
|
864 |
+
if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
|
865 |
for family in prediction_df['Target Family'].unique():
|
866 |
family_smiles_df = get_seen_smiles(family=family, task=task_value)
|
867 |
family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
|
868 |
lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
|
869 |
Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
|
870 |
)
|
871 |
+
|
872 |
+
@cache
|
873 |
+
def max_sim(smi):
|
874 |
+
return max_tanimoto_similarity(smi, family_smiles_df)['Max. Tanimoto Similarity']
|
875 |
+
|
876 |
+
prediction_df.loc[
|
877 |
+
prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
|
878 |
prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
|
879 |
)
|
880 |
max_sim.cache_clear()
|
881 |
|
882 |
+
if "Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target" in opts:
|
883 |
+
x2 = prediction_df['X2'].iloc[0]
|
884 |
+
pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)]
|
885 |
+
pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(
|
886 |
+
lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
|
887 |
+
Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
|
888 |
+
)
|
889 |
+
max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=pos_compounds_df))
|
890 |
+
prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
|
891 |
+
prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
|
892 |
+
)
|
893 |
+
max_sim.cache_clear()
|
894 |
+
|
895 |
+
if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
|
896 |
+
x2 = prediction_df['X2'].iloc[0]
|
897 |
+
prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
|
898 |
+
|
899 |
+
@cache
|
900 |
+
def calculate_max_sequence_identity(compound):
|
901 |
+
compound_targets = df_training.loc[df_training['X1'] == compound]
|
902 |
+
return max_sequence_identity(x2, seen_fastas=compound_targets)
|
903 |
+
|
904 |
+
prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
|
905 |
+
prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
|
906 |
+
)
|
907 |
+
|
908 |
+
calculate_max_sequence_identity.cache_clear()
|
909 |
+
|
910 |
+
if "Include Target Max. Sequence Identity to Training Targets" in opts:
|
911 |
for family in prediction_df['Target Family'].unique():
|
912 |
family_fastas_df = get_seen_fastas(family=family, task=task_value)
|
913 |
+
|
914 |
+
@cache
|
915 |
+
def max_id(seq):
|
916 |
+
return max_sequence_identity(seq, seen_fastas=family_fastas_df)['Max. Sequence Identity']
|
917 |
+
|
918 |
+
prediction_df.loc[
|
919 |
+
prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
|
920 |
prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
|
921 |
)
|
922 |
max_id.cache_clear()
|
|
|
1571 |
drug_library_upload = gr.File(label='Custom compound library file', visible=False)
|
1572 |
|
1573 |
drug_screen_opts = gr.CheckboxGroup(
|
1574 |
+
label="Step 6. Select Additional Options",
|
1575 |
+
choices=[
|
1576 |
+
'Include Compound Max. Tanimoto Similarity to Training Compounds',
|
1577 |
+
'Include Target Max. Sequence Identity to Known Interacting Targets of Compound',
|
1578 |
+
'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
|
1579 |
+
],
|
1580 |
+
info="These are experimental features and may increase the job computation time. "
|
1581 |
+
"Compound Max. Tanimoto Similarity to Training Compounds and "
|
1582 |
+
"Target Max. Sequence Identity to Known Interacting Targets of Compound "
|
1583 |
+
"are indicative of the predictive reliability of the model (the higher the more reliable), "
|
1584 |
+
"while Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target "
|
1585 |
+
"is indicative of the novelty of the compound (the lower the more novel)."
|
1586 |
)
|
1587 |
with gr.Row():
|
1588 |
with gr.Column():
|
|
|
1596 |
with gr.Row():
|
1597 |
drug_screen_clr_btn = gr.ClearButton(size='lg')
|
1598 |
drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
|
|
|
1599 |
|
1600 |
screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
|
1601 |
|
|
|
1685 |
target_library_upload = gr.File(label='Custom target library file', visible=False)
|
1686 |
|
1687 |
target_identify_opts = gr.CheckboxGroup(
|
1688 |
+
['Include Target Max. Sequence Identity to Training Targets'],
|
1689 |
label='Step 6. Select Additional Options',
|
1690 |
+
info="These are experimental features and may increase the job computation time. "
|
1691 |
+
"Target Max. Sequence Identity to Training Targets is indicative of the "
|
1692 |
+
"predictive reliability of the model (the higher the more reliable)."
|
1693 |
)
|
1694 |
with gr.Row():
|
1695 |
with gr.Column():
|
|
|
2014 |
x.name, gr.Dropdown(value=Path(x.name).name, choices=list(DRUG_LIBRARY_MAP.keys()) + [Path(x.name).name])
|
2015 |
], inputs=drug_library_upload_btn, outputs=[drug_library_upload, drug_library])
|
2016 |
|
2017 |
+
drug_screen_task.select(
|
2018 |
+
fn=lambda task, opts: [opt for opt in opts if opt not in [
|
2019 |
+
'Include Compound Max. Tanimoto Similarity to Known Interacting Compounds of Target',
|
2020 |
+
'Include Target Max. Sequence Identity to Known Interacting Targets of Compound'
|
2021 |
+
]] if task == 'Compound-Protein Binding Affinity' else opts,
|
2022 |
+
inputs=[drug_screen_task, drug_screen_opts], outputs=drug_screen_opts,
|
2023 |
+
show_progress='hidden'
|
2024 |
+
)
|
2025 |
|
2026 |
def example_fill(input_type):
|
2027 |
return {target_id: 'Q16539',
|
|
|
2646 |
|
2647 |
if __name__ == "__main__":
|
2648 |
pandarallel.initialize()
|
2649 |
+
|
2650 |
hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
|
2651 |
+
|
2652 |
session = requests.Session()
|
2653 |
ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
|
2654 |
session.mount('http://', ADAPTER)
|
2655 |
session.mount('https://', ADAPTER)
|
2656 |
+
|
2657 |
db = TinyDB(f'{SERVER_DATA_DIR}/db.json')
|
2658 |
# Set all RUNNING jobs to FAILED at TinyDB initialization
|
2659 |
Job = Query()
|
|
|
2661 |
for job in jobs:
|
2662 |
if job['status'] == 'RUNNING':
|
2663 |
db.update({'status': 'FAILED'}, Job.id == job['id'])
|
2664 |
+
|
2665 |
scheduler = BackgroundScheduler()
|
2666 |
scheduler.add_job(check_expiry, 'interval', hours=1)
|
2667 |
scheduler.start()
|
2668 |
+
|
2669 |
demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
|