Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
app.py
CHANGED
@@ -27,8 +27,8 @@ from pandarallel import pandarallel
|
|
27 |
import requests
|
28 |
from requests.adapters import HTTPAdapter, Retry
|
29 |
from markdown import markdown
|
30 |
-
from rdkit import Chem
|
31 |
-
from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen
|
32 |
from rdkit.Chem.Scaffolds import MurckoScaffold
|
33 |
import seaborn as sns
|
34 |
|
@@ -196,6 +196,13 @@ TARGET_FAMILY_MAP = {
|
|
196 |
'Nuclear Receptor': 'nuclear_receptor',
|
197 |
'Ion Channel': 'ion_channel',
|
198 |
'Others': 'others',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
}
|
200 |
|
201 |
TARGET_LIBRARY_MAP = {
|
@@ -247,7 +254,7 @@ def remove_job_record(job_id):
|
|
247 |
# Delete the job from the database
|
248 |
db.remove(Job.id == job_id)
|
249 |
# Delete the corresponding files
|
250 |
-
files = glob.glob(f"/
|
251 |
for file_path in files:
|
252 |
if os.path.exists(file_path):
|
253 |
os.remove(file_path)
|
@@ -265,7 +272,7 @@ def check_expiry():
|
|
265 |
# Delete the job from the database
|
266 |
db.remove(Job.id == job['id'])
|
267 |
# Delete the corresponding file
|
268 |
-
files = glob.glob(f"/
|
269 |
for file_path in files:
|
270 |
if os.path.exists(file_path):
|
271 |
os.remove(file_path)
|
@@ -278,8 +285,63 @@ def check_expiry():
|
|
278 |
send_email(job)
|
279 |
|
280 |
|
281 |
-
|
282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
|
284 |
|
285 |
def lipinski(mol):
|
@@ -635,46 +697,155 @@ using the job id. You will also receive an email notification once the job is do
|
|
635 |
raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
|
636 |
|
637 |
|
638 |
-
def submit_predict(predict_filepath, task, preset, target_family, state):
|
639 |
job_id = state['id']
|
640 |
status = "RUNNING"
|
641 |
error = None
|
642 |
task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
|
643 |
predictions_file = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
644 |
try:
|
645 |
-
target_family
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
646 |
|
647 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
648 |
|
649 |
-
|
650 |
-
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
|
658 |
-
f"data.data_file='{str(predict_filepath)}'"])
|
659 |
-
# with concurrent.futures.ThreadPoolExecutor() as executor:
|
660 |
-
# future = executor.submit(predict, cfg)
|
661 |
-
# try:
|
662 |
-
# predictions, _ = future.result(timeout=4*60*60)
|
663 |
-
# except concurrent.futures.TimeoutError:
|
664 |
-
# raise gr.Error("Prediction timed out.")
|
665 |
-
predictions, _ = predict(cfg)
|
666 |
-
predictions = [pd.DataFrame(prediction) for prediction in predictions]
|
667 |
-
prediction_df = pd.concat([prediction_df, pd.concat(predictions, ignore_index=True)])
|
668 |
-
prediction_df.set_index('N', inplace=True)
|
669 |
-
orig_df = pd.read_csv(
|
670 |
-
predict_filepath,
|
671 |
-
usecols=lambda x: x not in ['X1', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES',
|
672 |
-
'X2', 'ID2',
|
673 |
-
'Y', 'Y^']
|
674 |
-
)
|
675 |
-
prediction_df = pd.merge(prediction_df, orig_df, left_index=True, right_index=True, how='left')
|
676 |
|
677 |
-
prediction_df.to_csv(predictions_file)
|
678 |
status = "COMPLETED"
|
679 |
|
680 |
return {run_state: False}
|
@@ -714,19 +885,21 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
|
714 |
task = 'Compound-Protein Binding Affinity'
|
715 |
|
716 |
df = pd.read_csv(file)
|
|
|
717 |
if 'N' in df.columns:
|
718 |
df.set_index('N', inplace=True)
|
|
|
719 |
if not any(col in ['X1', 'X2'] for col in df.columns):
|
720 |
gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
|
721 |
return {analyze_btn: gr.Button(interactive=False)}
|
|
|
722 |
if 'X1' in df.columns:
|
723 |
-
df['Scaffold SMILES'] = df['X1'].parallel_apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
|
724 |
-
df['Scaffold'] = df['Scaffold SMILES'].parallel_apply(
|
725 |
-
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
726 |
-
# Add a new column with RDKit molecule objects
|
727 |
if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
|
728 |
df['Compound'] = df['X1'].parallel_apply(
|
729 |
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
|
|
|
|
|
|
730 |
|
731 |
# DF_FOR_REPORT = df.copy()
|
732 |
|
@@ -752,7 +925,7 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
|
752 |
return {analyze_btn: gr.Button(interactive=False)}
|
753 |
|
754 |
|
755 |
-
def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm=True)):
|
756 |
df_html = df.copy(deep=True)
|
757 |
column_aliases = COLUMN_ALIASES.copy()
|
758 |
cols_left = list(pd.Index(
|
@@ -763,9 +936,9 @@ def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm
|
|
763 |
if isinstance(task, str):
|
764 |
column_aliases.update({
|
765 |
'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
|
766 |
-
else 'Actual Binding Affinity',
|
767 |
'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
|
768 |
-
else 'Predicted Binding Affinity'
|
769 |
})
|
770 |
|
771 |
ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
|
@@ -803,12 +976,17 @@ def create_html_report(df, file=None, task=None, progress=gr.Progress(track_tqdm
|
|
803 |
|
804 |
elif 'Y^' in df_html.columns:
|
805 |
job = 'Interaction Pair Inference'
|
806 |
-
if 'Compound' in df_html.columns:
|
807 |
df_html['Compound'] = df_html['Compound'].parallel_apply(
|
808 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
809 |
-
|
|
|
|
|
|
|
810 |
df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
|
811 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
|
|
|
|
812 |
|
813 |
df_html.rename(columns=column_aliases, inplace=True)
|
814 |
df_html.index.name = 'Index'
|
@@ -1276,7 +1454,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1276 |
"Interaction prediction provides you binding probability score between the target of "
|
1277 |
"interest and each compound in the library, "
|
1278 |
"while affinity prediction directly estimates their binding strength measured using "
|
1279 |
-
"
|
1280 |
)
|
1281 |
drug_screen_task = gr.Dropdown(
|
1282 |
list(TASK_MAP.keys()),
|
@@ -1313,17 +1491,24 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1313 |
drug_library_upload_btn = gr.UploadButton(
|
1314 |
label='OR Upload Your Own Library', variant='primary')
|
1315 |
drug_library_upload = gr.File(label='Custom compound library file', visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1316 |
with gr.Row():
|
1317 |
with gr.Column():
|
1318 |
drug_screen_email = gr.Textbox(
|
1319 |
-
label='Step
|
1320 |
info="Your email address will be used to notify you of the status of your job. "
|
1321 |
"If you cannot receive the email, please check your spam/junk folder."
|
1322 |
)
|
1323 |
|
1324 |
with gr.Row(visible=True):
|
1325 |
with gr.Column():
|
1326 |
-
|
1327 |
drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
|
1328 |
# TODO Modify the pd df directly with df['X2'] = target
|
1329 |
|
@@ -1359,26 +1544,25 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1359 |
example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
|
1360 |
|
1361 |
with gr.Row():
|
1362 |
-
with gr.Column(visible=
|
1363 |
HelpTip(
|
1364 |
"By default, models trained on all protein families (general) will be applied. "
|
1365 |
-
|
1366 |
-
|
1367 |
)
|
1368 |
target_identify_target_family = gr.Dropdown(
|
1369 |
-
choices=['
|
1370 |
-
|
1371 |
-
|
1372 |
-
with gr.Row():
|
1373 |
with gr.Column():
|
1374 |
HelpTip(
|
1375 |
"Interaction prediction provides you binding probability score between the target of "
|
1376 |
"interest and each compound in the library, while affinity prediction directly "
|
1377 |
-
"estimates their binding strength measured using
|
1378 |
)
|
1379 |
target_identify_task = gr.Dropdown(
|
1380 |
list(TASK_MAP.keys()),
|
1381 |
-
label='Step
|
1382 |
value='Compound-Protein Interaction')
|
1383 |
|
1384 |
with gr.Column():
|
@@ -1389,8 +1573,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1389 |
"Please refer to the documentation for detailed benchmark results."
|
1390 |
)
|
1391 |
target_identify_preset = gr.Dropdown(
|
1392 |
-
list(PRESET_MAP.keys()),
|
1393 |
-
label='Step
|
1394 |
identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
|
1395 |
variant='primary')
|
1396 |
with gr.Row():
|
@@ -1403,7 +1587,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1403 |
"and can be downloaded by clicking the lower right corner."
|
1404 |
)
|
1405 |
target_library = gr.Dropdown(
|
1406 |
-
label='Step
|
1407 |
choices=list(TARGET_LIBRARY_MAP.keys()))
|
1408 |
with gr.Row():
|
1409 |
gr.File(label='Example FASTA target library',
|
@@ -1414,16 +1598,23 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1414 |
label='OR Upload Your Own Library', variant='primary')
|
1415 |
target_library_upload = gr.File(label='Custom target library file', visible=False)
|
1416 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1417 |
with gr.Row():
|
1418 |
with gr.Column():
|
1419 |
target_identify_email = gr.Textbox(
|
1420 |
-
label='Step
|
1421 |
info="Your email address will be used to notify you of the status of your job. "
|
1422 |
"If you cannot receive the email, please check your spam/junk folder."
|
1423 |
)
|
1424 |
|
1425 |
with gr.Row(visible=True):
|
1426 |
-
|
1427 |
target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
|
1428 |
size='lg')
|
1429 |
|
@@ -1501,7 +1692,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1501 |
"Interaction prediction provides you binding probability score "
|
1502 |
"between the target of interest and each compound in the library, "
|
1503 |
"while affinity prediction directly estimates their binding strength "
|
1504 |
-
"measured using
|
1505 |
)
|
1506 |
pair_infer_task = gr.Dropdown(
|
1507 |
list(TASK_MAP.keys()),
|
@@ -1525,7 +1716,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1525 |
"If you cannot receive the email, please check your spam/junk folder.")
|
1526 |
|
1527 |
with gr.Row(visible=True):
|
1528 |
-
|
1529 |
pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
|
1530 |
|
1531 |
infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
|
@@ -1546,25 +1737,33 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1546 |
Please first `Preview` the report, then `Generate` and download a CSV report
|
1547 |
or an interactive HTML report below if you wish to access the full report.
|
1548 |
''')
|
|
|
|
|
1549 |
with gr.Row():
|
1550 |
-
with gr.Column():
|
1551 |
file_for_report = gr.File(interactive=True, type='filepath')
|
1552 |
report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
|
1553 |
-
label='Specify the Task Labels in the
|
1554 |
-
|
1555 |
-
|
1556 |
-
|
1557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1558 |
|
1559 |
with gr.Row():
|
1560 |
-
|
1561 |
-
analyze_btn = gr.Button('
|
1562 |
-
interactive=False)
|
1563 |
|
1564 |
with gr.Row():
|
1565 |
with gr.Column(scale=3):
|
1566 |
html_report = gr.HTML() # label='Results', visible=True)
|
1567 |
-
|
1568 |
|
1569 |
with gr.Row():
|
1570 |
with gr.Column():
|
@@ -1584,8 +1783,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1584 |
if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
|
1585 |
|
1586 |
You will be redirected to Chemical Property Report for carrying out further analysis and
|
1587 |
-
generating the full report
|
1588 |
-
|
1589 |
''')
|
1590 |
with gr.Column():
|
1591 |
pred_lookup_id = gr.Textbox(
|
@@ -1689,8 +1888,8 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1689 |
|
1690 |
def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
|
1691 |
try:
|
1692 |
-
aligner = PairwiseAligner(
|
1693 |
-
alignment_df =
|
1694 |
|
1695 |
processed_fasta = process_target_fasta(fasta)
|
1696 |
|
@@ -1698,18 +1897,20 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1698 |
exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
|
1699 |
if not exact_match.empty:
|
1700 |
row = exact_match.iloc[0]
|
1701 |
-
return gr.Dropdown(
|
1702 |
-
|
|
|
1703 |
|
1704 |
# If no exact match, then calculate alignment score
|
1705 |
def align_score(query):
|
1706 |
-
|
|
|
1707 |
|
1708 |
alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
|
1709 |
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
1710 |
-
return gr.Dropdown(value=row['
|
1711 |
-
info=f"Reason: Best
|
1712 |
-
f"with {row['ID2']} from family {row['
|
1713 |
except Exception as e:
|
1714 |
gr.Warning("Failed to detect the protein family due to error: " + str(e))
|
1715 |
|
@@ -1772,7 +1973,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
1772 |
scenario_general = "Unseen Target"
|
1773 |
|
1774 |
seen_targets_family = pd.read_csv(
|
1775 |
-
f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family]}_{task.lower()}_random_split.csv')
|
1776 |
if process_target_fasta(fasta) in seen_targets_family['X2'].values:
|
1777 |
scenario_family = "Seen Target"
|
1778 |
else:
|
@@ -1787,12 +1988,16 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
1787 |
filtered_df = pd.concat([filtered_df_general, filtered_df_family])
|
1788 |
|
1789 |
row = filtered_df.loc[filtered_df[score].idxmax()]
|
|
|
|
|
|
|
|
|
1790 |
|
1791 |
return {drug_screen_preset:
|
1792 |
gr.Dropdown(value=row['Model'],
|
1793 |
info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
|
1794 |
-
f"model with the best {score}
|
1795 |
-
f"
|
1796 |
drug_screen_target_family:
|
1797 |
gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
|
1798 |
|
@@ -1848,9 +2053,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
1848 |
gr.Warning('Please enter a valid SMILES for model recommendation.')
|
1849 |
return None
|
1850 |
|
1851 |
-
|
1852 |
-
f'data/benchmarks/
|
1853 |
-
if rdkit_canonicalize(smiles) in
|
1854 |
scenario = "Seen Compound"
|
1855 |
else:
|
1856 |
scenario = "Unseen Compound"
|
@@ -1863,8 +2068,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
1863 |
|
1864 |
return gr.Dropdown(value=row['Model'],
|
1865 |
info=f"Reason: {scenario} in training; choosing the model "
|
1866 |
-
f"with the best {score}
|
1867 |
-
f"in the {scenario} scenario.")
|
1868 |
|
1869 |
|
1870 |
identify_preset_recommend_btn.click(fn=identify_recommend_model,
|
@@ -1965,7 +2169,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
1965 |
|
1966 |
job_id = str(uuid4())
|
1967 |
temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
|
1968 |
-
screen_df.to_csv(temp_file, index=False)
|
1969 |
if temp_file.is_file():
|
1970 |
job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
|
1971 |
return {screen_data_for_predict: str(temp_file),
|
@@ -1995,7 +2199,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
1995 |
|
1996 |
job_id = str(uuid4())
|
1997 |
temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
|
1998 |
-
identify_df.to_csv(temp_file, index=False)
|
1999 |
if temp_file.is_file():
|
2000 |
job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
|
2001 |
return {identify_data_for_predict: str(temp_file),
|
@@ -2043,7 +2247,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2043 |
f'than the allowed maximum {DATASET_MAX_LEN}.')
|
2044 |
|
2045 |
temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
|
2046 |
-
infer_df.to_csv(temp_file, index=False)
|
2047 |
|
2048 |
else:
|
2049 |
raise gr.Error('Should upload a compound-protein pair dataset, or '
|
@@ -2093,10 +2297,54 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2093 |
drug_screen_click.success(
|
2094 |
fn=submit_predict,
|
2095 |
inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
|
2096 |
-
drug_screen_target_family, run_state, ],
|
2097 |
outputs=[run_state, ]
|
2098 |
)
|
2099 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2100 |
target_identify_click = target_identify_btn.click(
|
2101 |
fn=target_identify_validate,
|
2102 |
inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
|
@@ -2125,7 +2373,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2125 |
target_identify_click.success(
|
2126 |
fn=submit_predict,
|
2127 |
inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
|
2128 |
-
target_identify_target_family, run_state, ], # , target_identify_email],
|
2129 |
outputs=[run_state, ]
|
2130 |
)
|
2131 |
|
@@ -2200,6 +2448,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2200 |
report_df_change = file_for_report.change(
|
2201 |
fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
|
2202 |
concurrency_limit=100,
|
|
|
|
|
|
|
2203 |
)
|
2204 |
|
2205 |
file_for_report.upload(
|
@@ -2214,8 +2465,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2214 |
file_for_report.clear(
|
2215 |
fn=lambda: [gr.Button(interactive=False)] * 3 +
|
2216 |
[gr.File(visible=False, value=None)] * 2 +
|
2217 |
-
[gr.Dropdown(visible=False, value=None),
|
2218 |
-
|
2219 |
outputs=[
|
2220 |
csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file, report_task, html_report
|
2221 |
]
|
@@ -2234,11 +2485,23 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2234 |
outputs=analyze_btn)
|
2235 |
|
2236 |
|
2237 |
-
def create_csv_report_file(df, file_report, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2238 |
try:
|
2239 |
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
2240 |
-
filename = f"/
|
2241 |
-
df.
|
|
|
|
|
2242 |
|
2243 |
return gr.File(filename)
|
2244 |
except Exception as e:
|
@@ -2246,28 +2509,32 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2246 |
return None
|
2247 |
|
2248 |
|
2249 |
-
def create_html_report_file(df, file_report, task, progress=gr.Progress(track_tqdm=True)):
|
2250 |
try:
|
2251 |
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
2252 |
-
filename = f"/
|
2253 |
-
create_html_report(df, filename, task)
|
2254 |
return gr.File(filename, visible=True)
|
2255 |
except Exception as e:
|
2256 |
gr.Warning(f"Failed to generate HTML due to error: {str(e)}")
|
2257 |
return None
|
2258 |
|
2259 |
|
2260 |
-
html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
|
|
|
2261 |
csv_generate.click(
|
2262 |
-
lambda: [gr.
|
2263 |
-
).then(fn=create_csv_report_file, inputs=[report_df, file_for_report],
|
2264 |
outputs=csv_download_file, show_progress='full')
|
2265 |
html_generate.click(
|
2266 |
-
lambda: [gr.
|
2267 |
-
).then(fn=create_html_report_file, inputs=[report_df, file_for_report, report_task],
|
2268 |
outputs=html_download_file, show_progress='full')
|
2269 |
|
|
|
2270 |
if __name__ == "__main__":
|
|
|
2271 |
hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
|
2272 |
-
pandarallel.initialize(progress_bar=True)
|
2273 |
demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
|
|
|
|
|
|
27 |
import requests
|
28 |
from requests.adapters import HTTPAdapter, Retry
|
29 |
from markdown import markdown
|
30 |
+
from rdkit import Chem, DataStructs
|
31 |
+
from rdkit.Chem import Draw, RDConfig, PandasTools, Descriptors, rdMolDescriptors, rdmolops, Lipinski, Crippen, AllChem
|
32 |
from rdkit.Chem.Scaffolds import MurckoScaffold
|
33 |
import seaborn as sns
|
34 |
|
|
|
196 |
'Nuclear Receptor': 'nuclear_receptor',
|
197 |
'Ion Channel': 'ion_channel',
|
198 |
'Others': 'others',
|
199 |
+
# 'general': 'general',
|
200 |
+
# 'kinase': 'kinase',
|
201 |
+
# 'non-kinase enzyme': 'non_kinase_enzyme',
|
202 |
+
# 'membrane receptor': 'membrane_receptor',
|
203 |
+
# 'nuclear Receptor': 'nuclear_receptor',
|
204 |
+
# 'ion channel': 'ion_channel',
|
205 |
+
# 'others': 'others',
|
206 |
}
|
207 |
|
208 |
TARGET_LIBRARY_MAP = {
|
|
|
254 |
# Delete the job from the database
|
255 |
db.remove(Job.id == job_id)
|
256 |
# Delete the corresponding files
|
257 |
+
files = glob.glob(f"{SERVER_DATA_DIR}/{job_id}*")
|
258 |
for file_path in files:
|
259 |
if os.path.exists(file_path):
|
260 |
os.remove(file_path)
|
|
|
272 |
# Delete the job from the database
|
273 |
db.remove(Job.id == job['id'])
|
274 |
# Delete the corresponding file
|
275 |
+
files = glob.glob(f"{SERVER_DATA_DIR}/{job['id']}*")
|
276 |
for file_path in files:
|
277 |
if os.path.exists(file_path):
|
278 |
os.remove(file_path)
|
|
|
285 |
send_email(job)
|
286 |
|
287 |
|
288 |
+
@cache
def max_tanimoto_similarity(smi, seen_smiles):
    """Return the highest Tanimoto similarity between ``smi`` and ``seen_smiles``.

    Both sides are compared as 2048-bit Morgan fingerprints of radius 2.

    Args:
        smi: Query SMILES string. ``None``, non-string values (for example the
            NaN float pandas uses for missing cells) and SMILES that RDKit
            cannot parse all yield ``0``.
        seen_smiles: Iterable of reference SMILES. It must be hashable (callers
            pass a tuple) because the function is memoised with ``@cache``.
            Entries that are not strings or that RDKit cannot parse are
            skipped instead of aborting the whole computation.

    Returns:
        ``0`` when no comparison is possible, ``1`` as soon as an identical
        fingerprint is found, otherwise the maximum similarity found.
    """
    # Guard before touching RDKit: MolFromSmiles rejects non-string arguments.
    if not isinstance(smi, str):
        return 0
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return 0
    mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)

    max_sim = 0
    for smiles in seen_smiles:
        if not isinstance(smiles, str):
            continue
        mol_seen = Chem.MolFromSmiles(smiles)
        if mol_seen is None:
            # Unparsable reference entry: fingerprinting None would raise.
            continue
        mol_seen_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_seen, radius=2, nBits=2048)
        sim = DataStructs.TanimotoSimilarity(mol_ecfp, mol_seen_ecfp)
        if sim == 1:
            # Nothing can beat a perfect match, so stop scanning.
            return 1
        max_sim = max(sim, max_sim)
    return max_sim
|
305 |
+
|
306 |
+
|
307 |
+
@cache
def max_sequence_identity(seq, seen_fastas):
    """Return the best normalised local-alignment score of ``seq`` against ``seen_fastas``.

    Each candidate is scored with a ``PairwiseAligner`` in ``'local'`` mode and
    the score is divided by the length of the longer of the two sequences.
    NOTE(review): this equals a sequence identity only if the aligner's default
    scoring counts one point per matching residue - confirm for the installed
    Biopython version.

    Args:
        seq: Query sequence. ``None``, non-string values (for example the NaN
            float pandas uses for missing cells) and the empty string yield
            ``0``.
        seen_fastas: Iterable of reference sequences. It must be hashable
            (callers pass a tuple) because the function is memoised with
            ``@cache``. Non-string or empty entries are skipped.

    Returns:
        ``0`` when no comparison is possible, ``1`` as soon as a candidate
        reaches a normalised score of exactly 1, otherwise the maximum
        normalised score found.
    """
    if not isinstance(seq, str) or not seq:
        return 0
    aligner = PairwiseAligner()
    aligner.mode = 'local'

    max_id = 0
    for fasta in seen_fastas:
        if not isinstance(fasta, str) or not fasta:
            continue
        # Only the score is needed, so skip building the alignments object.
        identity = aligner.score(seq, fasta) / max(len(seq), len(fasta))
        if identity == 1:
            return 1
        max_id = max(identity, max_id)
    return max_id
|
321 |
+
|
322 |
+
|
323 |
+
@cache
def get_seen_smiles(family, task):
    """Return the ``X1`` column of the seen-compounds benchmark split as a list.

    The CSV path is built from ``TARGET_FAMILY_MAP[family.title()]`` and the
    lower-cased ``task``. The result is memoised per ``(family, task)`` pair.
    """
    csv_path = f'data/benchmarks/seen_compounds/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv'
    return pd.read_csv(csv_path)['X1'].tolist()
|
328 |
+
|
329 |
+
|
330 |
+
@cache
def get_seen_fastas(family, task):
    """Return the ``X2`` column of the seen-targets benchmark split as a list.

    The CSV path is built from ``TARGET_FAMILY_MAP[family.title()]`` and the
    lower-cased ``task``. The result is memoised per ``(family, task)`` pair.
    """
    csv_path = f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv'
    return pd.read_csv(csv_path)['X2'].tolist()
|
335 |
+
|
336 |
+
|
337 |
+
@cache
def get_fasta_family_map():
    """Load the sequence-to-family lookup table from the two target-library CSVs.

    Reads the ``X2``, ``ID2`` and ``Target Family`` columns from both files,
    concatenates them in order and keeps the first row for every distinct
    ``X2`` value.

    NOTE(review): the function is memoised, so (assuming ``cache`` is
    ``functools.cache``) every caller receives the same DataFrame object.
    ``detect_family`` inside ``submit_predict`` assigns a ``score`` column to
    that object - confirm the shared mutation is intended.
    """
    columns = ['X2', 'ID2', 'Target Family']
    library_paths = (
        'data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv',
        'data/target_libraries/idmapping_not_in_chembl.csv',
    )
    frames = [pd.read_csv(path, usecols=columns) for path in library_paths]
    return pd.concat(frames).drop_duplicates(subset=['X2'], keep='first')
|
345 |
|
346 |
|
347 |
def lipinski(mol):
|
|
|
697 |
raise gr.Error(f'Failed to retrieve job status due to error: {str(e)}')
|
698 |
|
699 |
|
700 |
+
def submit_predict(predict_filepath, task, preset, target_family, opts, state):
|
701 |
job_id = state['id']
|
702 |
status = "RUNNING"
|
703 |
error = None
|
704 |
task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
|
705 |
predictions_file = None
|
706 |
+
|
707 |
+
df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
|
708 |
+
orig_df = pd.read_csv(predict_filepath)
|
709 |
+
alignment_df = get_fasta_family_map()
|
710 |
+
prediction_df = pd.DataFrame()
|
711 |
+
|
712 |
+
@cache
|
713 |
+
def detect_family(query):
|
714 |
+
# Check for an exact match first
|
715 |
+
exact_match = alignment_df[alignment_df['X2'] == query]
|
716 |
+
if not exact_match.empty:
|
717 |
+
row = exact_match.iloc[0]
|
718 |
+
return row['Target Family']
|
719 |
+
# If no exact match, then calculate alignment score
|
720 |
+
else:
|
721 |
+
aligner = PairwiseAligner(mode='local')
|
722 |
+
|
723 |
+
def align_score(target):
|
724 |
+
alignment = aligner.align(query, target)
|
725 |
+
return alignment.score / max(len(query), len(target))
|
726 |
+
|
727 |
+
alignment_df['score'] = alignment_df['X2'].apply(align_score)
|
728 |
+
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
729 |
+
return row['Target Family']
|
730 |
+
|
731 |
+
if 'Target Family' not in orig_df.columns:
|
732 |
+
orig_df['Target Family'] = None
|
733 |
+
orig_df.loc[
|
734 |
+
orig_df['Target Family'].isna(), 'Target Family'
|
735 |
+
] = orig_df.loc[
|
736 |
+
orig_df['Target Family'].isna(), 'X2'
|
737 |
+
].parallel_apply(detect_family)
|
738 |
+
|
739 |
+
detect_family.cache_clear()
|
740 |
+
|
741 |
+
orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
|
742 |
+
annotated_df = orig_df[~orig_df['Y'].isna()].copy()
|
743 |
+
annotated_df.rename(columns={'Y': 'Y^'}, inplace=True)
|
744 |
+
annotated_df['Prediction Source'] = 'Training Data'
|
745 |
+
# Resave the unannotated data
|
746 |
+
unannotated_df = orig_df[orig_df['Y'].isna()].drop(['Y', 'Target Family'], axis=1)
|
747 |
+
if not unannotated_df.empty:
|
748 |
+
unannotated_df.to_csv(predict_filepath, index=False, na_rep='')
|
749 |
+
else:
|
750 |
+
annotated_df.to_csv(predictions_file, index=False, na_rep='')
|
751 |
+
status = "COMPLETED"
|
752 |
+
return {run_state: False}
|
753 |
+
|
754 |
+
columns_to_drop = ['ID1', 'Compound', 'Scaffold', 'Scaffold SMILES', 'ID2', 'Y', 'Y^']
|
755 |
+
columns_to_drop = [col for col in columns_to_drop if col in orig_df.columns]
|
756 |
+
orig_df.drop(columns_to_drop, axis=1, inplace=True)
|
757 |
+
|
758 |
try:
|
759 |
+
if target_family != 'Family-Specific Auto-Recommendation':
|
760 |
+
target_family_value = TARGET_FAMILY_MAP[target_family.title()]
|
761 |
+
task_value = TASK_MAP[task]
|
762 |
+
preset_value = PRESET_MAP[preset]
|
763 |
+
predictions_file = (f'{SERVER_DATA_DIR}/'
|
764 |
+
f'{job_id}_{task_file_abbr[task]}_{preset_value}_{target_family_value}_predictions.csv')
|
765 |
+
|
766 |
+
cfg = hydra.compose(
|
767 |
+
config_name="webserver_inference",
|
768 |
+
overrides=[f"task={task_value}",
|
769 |
+
f"preset={preset_value}",
|
770 |
+
f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family_value}.ckpt",
|
771 |
+
f"data.data_file='{str(predict_filepath)}'"])
|
772 |
+
|
773 |
+
predictions, _ = predict(cfg)
|
774 |
+
predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
|
775 |
+
predictions['Prediction Source'] = f'{preset} ({target_family})'
|
776 |
+
prediction_df = pd.concat([prediction_df, predictions])
|
777 |
|
778 |
+
else:
|
779 |
+
predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_{preset}_auto_predictions.csv'
|
780 |
+
task_value = TASK_MAP[task]
|
781 |
+
score = TASK_METRIC_MAP[task]
|
782 |
+
benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
|
783 |
+
predict_df = pd.read_csv(predict_filepath)
|
784 |
+
|
785 |
+
for family, subset in predict_df.groupby('Target Family'):
|
786 |
+
predict_subset_filepath = f'{SERVER_DATA_DIR}/{job_id}_{family}_input.csv'
|
787 |
+
subset.to_csv(predict_subset_filepath, index=False, na_rep='')
|
788 |
+
seen_compounds = get_seen_smiles(family, task_value)
|
789 |
+
|
790 |
+
if subset['X1'].iloc[0] in seen_compounds['X1'].values:
|
791 |
+
scenario = "Seen Compound"
|
792 |
+
else:
|
793 |
+
scenario = "Unseen Compound"
|
794 |
+
|
795 |
+
filtered_df = benchmark_df[(benchmark_df['Family'] == target_family.title())
|
796 |
+
& (benchmark_df['Scenario'] == scenario)]
|
797 |
+
|
798 |
+
preset = filtered_df.loc[filtered_df[score].idxmax(), 'preset']
|
799 |
+
preset_value = PRESET_MAP[preset]
|
800 |
+
|
801 |
+
target_family = TARGET_FAMILY_MAP[family.title()]
|
802 |
+
cfg = hydra.compose(
|
803 |
+
config_name="webserver_inference",
|
804 |
+
overrides=[f"task={task_value}",
|
805 |
+
f"preset={preset_value}",
|
806 |
+
f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
|
807 |
+
f"data.data_file='{str(predict_subset_filepath)}'"])
|
808 |
+
|
809 |
+
predictions, _ = predict(cfg)
|
810 |
+
predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
|
811 |
+
predictions['Prediction Source'] = f'{preset} ({family})'
|
812 |
+
prediction_df = pd.concat([prediction_df, predictions])
|
813 |
+
|
814 |
+
prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
|
815 |
+
prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)
|
816 |
+
|
817 |
+
# prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
|
818 |
+
# lambda group: group.parallel_apply(
|
819 |
+
# max_tanimoto_similarity,
|
820 |
+
# seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
|
821 |
+
# )
|
822 |
+
# ).values
|
823 |
+
#
|
824 |
+
# prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
|
825 |
+
# lambda group: group.parallel_apply(
|
826 |
+
# max_sequence_identity,
|
827 |
+
# seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
|
828 |
+
# )
|
829 |
+
# ).values
|
830 |
+
if "Include Max. Tanimoto Similarity" in opts:
|
831 |
+
for family in prediction_df['Target Family'].unique():
|
832 |
+
prediction_df.loc[
|
833 |
+
prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = prediction_df.loc[
|
834 |
+
prediction_df['Target Family'] == family, 'X1'].parallel_apply(
|
835 |
+
max_tanimoto_similarity,
|
836 |
+
seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
|
837 |
+
)
|
838 |
|
839 |
+
if "Include Max. Sequence Identity" in opts:
|
840 |
+
for family in prediction_df['Target Family'].unique():
|
841 |
+
prediction_df.loc[
|
842 |
+
prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = prediction_df.loc[
|
843 |
+
prediction_df['Target Family'] == family, 'X2'].parallel_apply(
|
844 |
+
max_sequence_identity,
|
845 |
+
seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
|
846 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
847 |
|
848 |
+
prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
|
849 |
status = "COMPLETED"
|
850 |
|
851 |
return {run_state: False}
|
|
|
885 |
task = 'Compound-Protein Binding Affinity'
|
886 |
|
887 |
df = pd.read_csv(file)
|
888 |
+
|
889 |
if 'N' in df.columns:
|
890 |
df.set_index('N', inplace=True)
|
891 |
+
|
892 |
if not any(col in ['X1', 'X2'] for col in df.columns):
|
893 |
gr.Warning("At least one of columns `X1` and `X2` must be in the uploaded dataset.")
|
894 |
return {analyze_btn: gr.Button(interactive=False)}
|
895 |
+
|
896 |
if 'X1' in df.columns:
|
|
|
|
|
|
|
|
|
897 |
if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
|
898 |
df['Compound'] = df['X1'].parallel_apply(
|
899 |
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
900 |
+
df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
|
901 |
+
df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
|
902 |
+
|
903 |
|
904 |
# DF_FOR_REPORT = df.copy()
|
905 |
|
|
|
925 |
return {analyze_btn: gr.Button(interactive=False)}
|
926 |
|
927 |
|
928 |
+
def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(track_tqdm=True)):
|
929 |
df_html = df.copy(deep=True)
|
930 |
column_aliases = COLUMN_ALIASES.copy()
|
931 |
cols_left = list(pd.Index(
|
|
|
936 |
if isinstance(task, str):
|
937 |
column_aliases.update({
|
938 |
'Y': 'Actual Interaction Probability' if task == 'Compound-Protein Interaction'
|
939 |
+
else 'Actual Binding Affinity pIC50 [nM]',
|
940 |
'Y^': 'Predicted Interaction Probability' if task == 'Compound-Protein Interaction'
|
941 |
+
else 'Predicted Binding Affinity (pIC50 [nM])'
|
942 |
})
|
943 |
|
944 |
ascending = True if column_aliases['Y^'] == 'Predicted Binding Affinity' else False
|
|
|
976 |
|
977 |
elif 'Y^' in df_html.columns:
|
978 |
job = 'Interaction Pair Inference'
|
979 |
+
if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
|
980 |
df_html['Compound'] = df_html['Compound'].parallel_apply(
|
981 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
982 |
+
else:
|
983 |
+
df_html.drop(['Compound'], axis=1, inplace=True)
|
984 |
+
|
985 |
+
if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
|
986 |
df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
|
987 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
988 |
+
else:
|
989 |
+
df_html.drop(['Scaffold'], axis=1, inplace=True)
|
990 |
|
991 |
df_html.rename(columns=column_aliases, inplace=True)
|
992 |
df_html.index.name = 'Index'
|
|
|
1454 |
"Interaction prediction provides you binding probability score between the target of "
|
1455 |
"interest and each compound in the library, "
|
1456 |
"while affinity prediction directly estimates their binding strength measured using "
|
1457 |
+
"pIC<sub>50</sub> in units of nM."
|
1458 |
)
|
1459 |
drug_screen_task = gr.Dropdown(
|
1460 |
list(TASK_MAP.keys()),
|
|
|
1491 |
drug_library_upload_btn = gr.UploadButton(
|
1492 |
label='OR Upload Your Own Library', variant='primary')
|
1493 |
drug_library_upload = gr.File(label='Custom compound library file', visible=False)
|
1494 |
+
with gr.Column():
|
1495 |
+
drug_screen_opts = gr.CheckboxGroup(
|
1496 |
+
['Include Max. Tanimoto Similarity'],
|
1497 |
+
label='Step 6. Select Additional Options',
|
1498 |
+
info="Calculating the maximum Tanimoto similarity of the library compounds to the "
|
1499 |
+
"training dataset is an experimental feature and may take a considerable amount of time."
|
1500 |
+
)
|
1501 |
with gr.Row():
|
1502 |
with gr.Column():
|
1503 |
drug_screen_email = gr.Textbox(
|
1504 |
+
label='Step 7. Input Your Email Address (Optional)',
|
1505 |
info="Your email address will be used to notify you of the status of your job. "
|
1506 |
"If you cannot receive the email, please check your spam/junk folder."
|
1507 |
)
|
1508 |
|
1509 |
with gr.Row(visible=True):
|
1510 |
with gr.Column():
|
1511 |
+
drug_screen_clr_btn = gr.ClearButton(size='lg')
|
1512 |
drug_screen_btn = gr.Button(value='SUBMIT THE SCREENING JOB', variant='primary', size='lg')
|
1513 |
# TODO Modify the pd df directly with df['X2'] = target
|
1514 |
|
|
|
1544 |
example_drug = gr.Button(value='Example: Aspirin', elem_classes='example')
|
1545 |
|
1546 |
with gr.Row():
|
1547 |
+
with gr.Column(visible=True):
|
1548 |
HelpTip(
|
1549 |
"By default, models trained on all protein families (general) will be applied. "
|
1550 |
+
"If you upload a target library containing proteins all in the same family, "
|
1551 |
+
"you may manually select a Target Family."
|
1552 |
)
|
1553 |
target_identify_target_family = gr.Dropdown(
|
1554 |
+
choices=['Family-Specific Auto-Recommendation'] + list(TARGET_FAMILY_MAP.keys()),
|
1555 |
+
value='General',
|
1556 |
+
label='Step 2. Select Target Family')
|
|
|
1557 |
with gr.Column():
|
1558 |
HelpTip(
|
1559 |
"Interaction prediction provides you binding probability score between the target of "
|
1560 |
"interest and each compound in the library, while affinity prediction directly "
|
1561 |
+
"estimates their binding strength measured using pIC<sub>50</sub> in units of nM."
|
1562 |
)
|
1563 |
target_identify_task = gr.Dropdown(
|
1564 |
list(TASK_MAP.keys()),
|
1565 |
+
label='Step 3. Select a Prediction Task',
|
1566 |
value='Compound-Protein Interaction')
|
1567 |
|
1568 |
with gr.Column():
|
|
|
1573 |
"Please refer to the documentation for detailed benchmark results."
|
1574 |
)
|
1575 |
target_identify_preset = gr.Dropdown(
|
1576 |
+
['Family-Specific Auto-Recommendation'] + list(PRESET_MAP.keys()),
|
1577 |
+
label='Step 4. Select a Preset Model')
|
1578 |
identify_preset_recommend_btn = gr.Button(value='OR Let Us Recommend for You',
|
1579 |
variant='primary')
|
1580 |
with gr.Row():
|
|
|
1587 |
"and can be downloaded by clicking the lower right corner."
|
1588 |
)
|
1589 |
target_library = gr.Dropdown(
|
1590 |
+
label='Step 5. Select a Preset Target Library',
|
1591 |
choices=list(TARGET_LIBRARY_MAP.keys()))
|
1592 |
with gr.Row():
|
1593 |
gr.File(label='Example FASTA target library',
|
|
|
1598 |
label='OR Upload Your Own Library', variant='primary')
|
1599 |
target_library_upload = gr.File(label='Custom target library file', visible=False)
|
1600 |
|
1601 |
+
with gr.Column():
|
1602 |
+
target_identify_opts = gr.CheckboxGroup(
|
1603 |
+
['Include Max. Sequence Identity'],
|
1604 |
+
label='Step 6. Select Additional Options',
|
1605 |
+
info="Calculating the maximum sequence identity of the library protein to the "
|
1606 |
+
"training dataset is an experimental feature and may take a considerable amount of time."
|
1607 |
+
)
|
1608 |
with gr.Row():
|
1609 |
with gr.Column():
|
1610 |
target_identify_email = gr.Textbox(
|
1611 |
+
label='Step 7. Input Your Email Address (Optional)',
|
1612 |
info="Your email address will be used to notify you of the status of your job. "
|
1613 |
"If you cannot receive the email, please check your spam/junk folder."
|
1614 |
)
|
1615 |
|
1616 |
with gr.Row(visible=True):
|
1617 |
+
target_identify_clr_btn = gr.ClearButton(size='lg')
|
1618 |
target_identify_btn = gr.Button(value='SUBMIT THE IDENTIFICATION JOB', variant='primary',
|
1619 |
size='lg')
|
1620 |
|
|
|
1692 |
"Interaction prediction provides you binding probability score "
|
1693 |
"between the target of interest and each compound in the library, "
|
1694 |
"while affinity prediction directly estimates their binding strength "
|
1695 |
+
"measured using pIC<sub>50</sub> in units of nM."
|
1696 |
)
|
1697 |
pair_infer_task = gr.Dropdown(
|
1698 |
list(TASK_MAP.keys()),
|
|
|
1716 |
"If you cannot receive the email, please check your spam/junk folder.")
|
1717 |
|
1718 |
with gr.Row(visible=True):
|
1719 |
+
pair_infer_clr_btn = gr.ClearButton(size='lg')
|
1720 |
pair_infer_btn = gr.Button(value='SUBMIT THE INFERENCE JOB', variant='primary', size='lg')
|
1721 |
|
1722 |
infer_data_for_predict = gr.File(file_count="single", type='filepath', visible=False)
|
|
|
1737 |
Please first `Preview` the report, then `Generate` and download a CSV report
|
1738 |
or an interactive HTML report below if you wish to access the full report.
|
1739 |
''')
|
1740 |
+
raw_df = gr.State(value=pd.DataFrame())
|
1741 |
+
report_df = gr.State(value=pd.DataFrame())
|
1742 |
with gr.Row():
|
1743 |
+
with gr.Column(scale=1):
|
1744 |
file_for_report = gr.File(interactive=True, type='filepath')
|
1745 |
report_task = gr.Dropdown(list(TASK_MAP.keys()), visible=False, value=None,
|
1746 |
+
label='Specify the Task Labels in the Uploaded Dataset')
|
1747 |
+
with gr.Column(scale=2):
|
1748 |
+
with gr.Row():
|
1749 |
+
scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Compound Scores')
|
1750 |
+
filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Compound Filters')
|
1751 |
+
with gr.Accordion('Report Generate Options', open=False):
|
1752 |
+
with gr.Row():
|
1753 |
+
csv_sep = gr.Radio(label='CSV Delimiter',
|
1754 |
+
choices=['Comma', 'Tab'], value='Comma')
|
1755 |
+
html_opts = gr.CheckboxGroup(label='HTML Report Options',
|
1756 |
+
choices=['Exclude Molecular Graph', 'Exclude Scaffold Graph'])
|
1757 |
|
1758 |
with gr.Row():
|
1759 |
+
report_clr_btn = gr.ClearButton(size='lg')
|
1760 |
+
analyze_btn = gr.Button('Calculate Properties and Preview', variant='primary',
|
1761 |
+
size='lg', interactive=False)
|
1762 |
|
1763 |
with gr.Row():
|
1764 |
with gr.Column(scale=3):
|
1765 |
html_report = gr.HTML() # label='Results', visible=True)
|
1766 |
+
ranking_pie_chart = gr.Plot(visible=False)
|
1767 |
|
1768 |
with gr.Row():
|
1769 |
with gr.Column():
|
|
|
1783 |
if the job has completed. Note that predictions are only kept for 48 hours upon job completion.
|
1784 |
|
1785 |
You will be redirected to Chemical Property Report for carrying out further analysis and
|
1786 |
+
generating the full report when the job is done. If the Lookup fails to respond, please wait for a
|
1787 |
+
few minutes and refresh the page to try again.
|
1788 |
''')
|
1789 |
with gr.Column():
|
1790 |
pred_lookup_id = gr.Textbox(
|
|
|
1888 |
|
1889 |
def target_family_detect(fasta, progress=gr.Progress(track_tqdm=True)):
|
1890 |
try:
|
1891 |
+
aligner = PairwiseAligner(mode='local')
|
1892 |
+
alignment_df = get_fasta_family_map()
|
1893 |
|
1894 |
processed_fasta = process_target_fasta(fasta)
|
1895 |
|
|
|
1897 |
exact_match = alignment_df[alignment_df['X2'] == processed_fasta]
|
1898 |
if not exact_match.empty:
|
1899 |
row = exact_match.iloc[0]
|
1900 |
+
return gr.Dropdown(
|
1901 |
+
value=row['Target Family'],
|
1902 |
+
info=f"Reason: Exact match found with {row['ID2']} from family {row['Target Family']}")
|
1903 |
|
1904 |
# If no exact match, then calculate alignment score
|
1905 |
def align_score(query):
|
1906 |
+
alignment = aligner.align(processed_fasta, query)
|
1907 |
+
return alignment.score / max(len(processed_fasta), len(query))
|
1908 |
|
1909 |
alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
|
1910 |
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
1911 |
+
return gr.Dropdown(value=row['Target Family'],
|
1912 |
+
info=f"Reason: Best sequence identity ({row['score']}) "
|
1913 |
+
f"with {row['ID2']} from family {row['Target Family']}")
|
1914 |
except Exception as e:
|
1915 |
gr.Warning("Failed to detect the protein family due to error: " + str(e))
|
1916 |
|
|
|
1973 |
scenario_general = "Unseen Target"
|
1974 |
|
1975 |
seen_targets_family = pd.read_csv(
|
1976 |
+
f'data/benchmarks/seen_targets/{TARGET_FAMILY_MAP[family.title()]}_{task.lower()}_random_split.csv')
|
1977 |
if process_target_fasta(fasta) in seen_targets_family['X2'].values:
|
1978 |
scenario_family = "Seen Target"
|
1979 |
else:
|
|
|
1988 |
filtered_df = pd.concat([filtered_df_general, filtered_df_family])
|
1989 |
|
1990 |
row = filtered_df.loc[filtered_df[score].idxmax()]
|
1991 |
+
if row['Scenario'] == 'Seen Target':
|
1992 |
+
scenario = "Seen Target (>=0.85 sequence identity)"
|
1993 |
+
elif row['Scenario'] == 'Unseen Target':
|
1994 |
+
scenario = "Unseen Target (<0.85 sequence identity)"
|
1995 |
|
1996 |
return {drug_screen_preset:
|
1997 |
gr.Dropdown(value=row['Model'],
|
1998 |
info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
|
1999 |
+
f"model with the best {score} in the {scenario} scenario "
|
2000 |
+
f"on {row['Family']}."),
|
2001 |
drug_screen_target_family:
|
2002 |
gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}
|
2003 |
|
|
|
2053 |
gr.Warning('Please enter a valid SMILES for model recommendation.')
|
2054 |
return None
|
2055 |
|
2056 |
+
seen_compounds = pd.read_csv(
|
2057 |
+
f'data/benchmarks/seen_compounds/all_families_full_{task.lower()}_random_split.csv')
|
2058 |
+
if rdkit_canonicalize(smiles) in seen_compounds['X1'].values:
|
2059 |
scenario = "Seen Compound"
|
2060 |
else:
|
2061 |
scenario = "Unseen Compound"
|
|
|
2068 |
|
2069 |
return gr.Dropdown(value=row['Model'],
|
2070 |
info=f"Reason: {scenario} in training; choosing the model "
|
2071 |
+
f"with the best {score} in the {scenario} scenario.")
|
|
|
2072 |
|
2073 |
|
2074 |
identify_preset_recommend_btn.click(fn=identify_recommend_model,
|
|
|
2169 |
|
2170 |
job_id = str(uuid4())
|
2171 |
temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
|
2172 |
+
screen_df.to_csv(temp_file, index=False, na_rep='')
|
2173 |
if temp_file.is_file():
|
2174 |
job_info = common_job_initiate(job_id, 'Drug Hit Screening', email, request, task)
|
2175 |
return {screen_data_for_predict: str(temp_file),
|
|
|
2199 |
|
2200 |
job_id = str(uuid4())
|
2201 |
temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
|
2202 |
+
identify_df.to_csv(temp_file, index=False, na_rep='')
|
2203 |
if temp_file.is_file():
|
2204 |
job_info = common_job_initiate(job_id, 'Target Protein Identification', email, request, task)
|
2205 |
return {identify_data_for_predict: str(temp_file),
|
|
|
2247 |
f'than the allowed maximum {DATASET_MAX_LEN}.')
|
2248 |
|
2249 |
temp_file = Path(f'{SERVER_DATA_DIR}/{job_id}_input.csv').resolve()
|
2250 |
+
infer_df.to_csv(temp_file, index=False, na_rep='')
|
2251 |
|
2252 |
else:
|
2253 |
raise gr.Error('Should upload a compound-protein pair dataset, or '
|
|
|
2297 |
drug_screen_click.success(
|
2298 |
fn=submit_predict,
|
2299 |
inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
|
2300 |
+
drug_screen_target_family, drug_screen_opts, run_state, ],
|
2301 |
outputs=[run_state, ]
|
2302 |
)
|
2303 |
|
2304 |
+
# Reset handlers for the three job-submission forms. Each lambda must return
# exactly one value per component listed in `outputs`: the target-family
# dropdown goes back to 'General' and every other listed input is emptied.
drug_screen_clr_btn.click(
    lambda: ['General'] + [None] * 5,
    outputs=[drug_screen_target_family,
             target_fasta, drug_screen_preset, drug_library, drug_library_upload, drug_screen_email])

target_identify_clr_btn.click(
    lambda: ['General'] + [None] * 5,
    outputs=[target_identify_target_family,
             compound_smiles, target_identify_preset, target_library, target_library_upload, target_identify_email])

# Six outputs are listed, so six values are returned (the previous
# `[None] * 4` produced only five and made the click fail).
pair_infer_clr_btn.click(
    lambda: ['General'] + [None] * 5,
    outputs=[pair_infer_target_family,
             infer_pair, infer_drug, infer_target, pair_infer_preset, pair_infer_email])
|
2318 |
+
|
2319 |
+
# Reset the report tab's own option groups. The previous wiring was a
# copy of the drug-screening handler: it returned five values for six
# outputs, pushed 'General' into the `scores` checkbox group and cleared
# drug-screening widgets from the report tab.
# NOTE(review): the set of components to reset is an assumption — extend it
# (e.g. with `file_for_report` or `csv_sep`) if those should be cleared too.
report_clr_btn.click(
    lambda: [None] * 3,
    outputs=[scores, filters, html_opts])
|
2323 |
+
|
2324 |
+
|
2325 |
+
def update_preset(family, preset):
    """Keep the preset dropdown in sync with the target-family dropdown.

    Args:
        family: Current value of the target-family dropdown.
        preset: Current value of the preset-model dropdown.

    Returns:
        The auto-recommendation option when the family is set to it, ``None``
        when only the preset is still set to it, otherwise ``preset`` unchanged.
    """
    auto_option = 'Family-Specific Auto-Recommendation'
    if family == auto_option:
        return auto_option
    # Family is a concrete choice: a lingering auto preset is cleared.
    return None if preset == auto_option else preset
|
2332 |
+
|
2333 |
+
def update_family(family, preset):
    """Keep the target-family dropdown in sync with the preset dropdown.

    Args:
        family: Current value of the target-family dropdown.
        preset: Current value of the preset-model dropdown.

    Returns:
        The auto-recommendation option when the preset is set to it, ``None``
        when only the family is still set to it, otherwise ``family`` unchanged.
    """
    sentinel = 'Family-Specific Auto-Recommendation'
    if preset == sentinel:
        return sentinel
    if family == sentinel:
        # Preset is a concrete model, so the auto family no longer applies.
        return None
    return family
|
2340 |
+
|
2341 |
+
# Two-way sync between the family and preset dropdowns of the target
# identification form: a change on either side recomputes the other via
# `update_preset` / `update_family`. The progress indicator is hidden for
# both handlers.
target_identify_target_family.change(
    fn=update_preset, inputs=[target_identify_target_family, target_identify_preset],
    outputs=target_identify_preset, show_progress='hidden')
target_identify_preset.change(
    fn=update_family, inputs=[target_identify_target_family, target_identify_preset],
    outputs=target_identify_target_family, show_progress='hidden')
|
2347 |
+
|
2348 |
target_identify_click = target_identify_btn.click(
|
2349 |
fn=target_identify_validate,
|
2350 |
inputs=[compound_smiles, target_library, target_library_upload, target_identify_preset, target_identify_task,
|
|
|
2373 |
target_identify_click.success(
|
2374 |
fn=submit_predict,
|
2375 |
inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
|
2376 |
+
target_identify_target_family, target_identify_opts, run_state, ], # , target_identify_email],
|
2377 |
outputs=[run_state, ]
|
2378 |
)
|
2379 |
|
|
|
2448 |
report_df_change = file_for_report.change(
|
2449 |
fn=update_df, inputs=file_for_report, outputs=[html_report, raw_df, report_df, analyze_btn, report_task],
|
2450 |
concurrency_limit=100,
|
2451 |
+
).then(
|
2452 |
+
fn=lambda: [gr.Button(interactive=True)] * 2,
|
2453 |
+
outputs=[csv_generate, html_generate],
|
2454 |
)
|
2455 |
|
2456 |
file_for_report.upload(
|
|
|
2465 |
file_for_report.clear(
|
2466 |
fn=lambda: [gr.Button(interactive=False)] * 3 +
|
2467 |
[gr.File(visible=False, value=None)] * 2 +
|
2468 |
+
[gr.Dropdown(visible=False, value=None), gr.HTML(visible=False)],
|
2469 |
+
cancels=[report_df_change],
|
2470 |
outputs=[
|
2471 |
csv_generate, html_generate, analyze_btn, csv_download_file, html_download_file, report_task, html_report
|
2472 |
]
|
|
|
2485 |
outputs=analyze_btn)
|
2486 |
|
2487 |
|
2488 |
+
def create_csv_report_file(df, file_report, task, sep, progress=gr.Progress(track_tqdm=True)):
    """Write the report dataframe to a delimited text file and return it for download.

    Args:
        df: Report dataframe. A ``'Y^'`` column, if present, is renamed
            according to ``task``; ``'Compound'`` and ``'Scaffold'`` columns,
            if present, are left out of the file.
        file_report: Uploaded file object; the stem of its ``.name`` is used
            in the output filename.
        task: Task label. ``'Compound-Protein Interaction'`` names the
            prediction column ``'Y^_prob'``, ``'Compound-Protein Binding
            Affinity'`` names it ``'Y^_pIC50'``; anything else keeps ``'Y^'``.
        sep: ``'Comma'`` or ``'Tab'``.
        progress: Gradio progress tracker (not used directly in the body).

    Returns:
        A ``gr.File`` pointing at the written file, or ``None`` after showing
        a warning if anything fails.
    """
    csv_sep_map = {
        'Comma': ',',
        'Tab': '\t',
    }
    # Interaction models output probabilities, affinity models output pIC50.
    y_colname_map = {
        'Compound-Protein Interaction': 'Y^_prob',
        'Compound-Protein Binding Affinity': 'Y^_pIC50',
    }
    y_colname = y_colname_map.get(task, 'Y^') if isinstance(task, str) else 'Y^'
    try:
        now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        filename = f"{SERVER_DATA_DIR}/{Path(file_report.name).stem}_DeepSEQreen_report_{now}.csv"
        # errors='ignore': datasets without an X1 column never get the
        # Compound/Scaffold columns, and that must not abort the export.
        df.rename(columns={'Y^': y_colname}).drop(
            labels=['Compound', 'Scaffold'], axis=1, errors='ignore'
        ).to_csv(filename, index=False, na_rep='', sep=csv_sep_map[sep])

        return gr.File(filename)
    except Exception as e:
        # NOTE(review): this handler line is outside the visible diff hunk and
        # was reconstructed to mirror create_html_report_file — confirm wording.
        gr.Warning(f"Failed to generate CSV due to error: {str(e)}")
        return None
|
2510 |
|
2511 |
|
2512 |
+
def create_html_report_file(df, file_report, task, opts, progress=gr.Progress(track_tqdm=True)):
    """Generate the HTML report file and return it as a visible download component.

    Args:
        df: Report dataframe, passed through to ``create_html_report``.
        file_report: Uploaded file object; the stem of its ``.name`` is used
            in the output filename.
        task: Task label, passed through to ``create_html_report``.
        opts: HTML report options, passed through to ``create_html_report``.
        progress: Gradio progress tracker (not used directly in the body).

    Returns:
        ``gr.File(filename, visible=True)`` on success, or ``None`` after
        showing a warning if anything fails.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        source_stem = Path(file_report.name).stem
        filename = f"{SERVER_DATA_DIR}/{source_stem}_DeepSEQreen_report_{timestamp}.html"
        # Presumably writes the report to `filename` — its body is not in view.
        create_html_report(df, filename, task, opts)
        return gr.File(filename, visible=True)
    except Exception as err:
        gr.Warning(f"Failed to generate HTML due to error: {str(err)}")
        return None
|
2521 |
|
2522 |
|
2523 |
+
# html_report.change(lambda: [gr.Button(visible=True)] * 2, outputs=[csv_generate, html_generate])
|
2524 |
+
|
2525 |
# Each generate button first makes its download component visible, then
# chains the report builder, which fills that same component with the
# generated file; the second step uses show_progress='full'.
csv_generate.click(
    lambda: [gr.File(visible=True)], outputs=[csv_download_file],
).then(fn=create_csv_report_file, inputs=[report_df, file_for_report, report_task, csv_sep],
       outputs=csv_download_file, show_progress='full')
html_generate.click(
    lambda: [gr.File(visible=True)], outputs=[html_download_file],
).then(fn=create_html_report_file, inputs=[report_df, file_for_report, report_task, html_opts],
       outputs=html_download_file, show_progress='full')
|
2533 |
|
2534 |
+
|
2535 |
if __name__ == "__main__":
    pandarallel.initialize()
    hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
    # Register and start the hourly expiry check BEFORE launching the app:
    # launch() blocks the main thread while the server is running, so any
    # statement placed after it only runs once the server has shut down.
    # NOTE(review): assumes `scheduler` is a non-blocking (background)
    # scheduler — its definition is not in view; confirm.
    scheduler.add_job(check_expiry, 'interval', hours=1)
    scheduler.start()
    demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
|