Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
app.py
CHANGED
@@ -43,7 +43,7 @@ import panel as pn
|
|
43 |
from apscheduler.schedulers.background import BackgroundScheduler
|
44 |
from tinydb import TinyDB, Query
|
45 |
|
46 |
-
import swifter
|
47 |
from tqdm.auto import tqdm
|
48 |
|
49 |
from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
|
@@ -786,7 +786,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
786 |
orig_df['Target Family'] = None
|
787 |
if orig_df['Target Family'].isna().any():
|
788 |
orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
|
789 |
-
orig_df.loc[orig_df['Target Family'].isna(), 'X2'].
|
790 |
)
|
791 |
detect_family.cache_clear()
|
792 |
|
@@ -885,7 +885,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
885 |
if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
|
886 |
for family in prediction_df['Target Family'].unique():
|
887 |
family_smiles_df = get_seen_smiles(family=family, task=task_value)
|
888 |
-
family_smiles_df['FP'] = family_smiles_df['X1'].
|
889 |
|
890 |
@cache
|
891 |
def max_sim(smi):
|
@@ -893,7 +893,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
893 |
|
894 |
prediction_df.loc[
|
895 |
prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
|
896 |
-
prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].
|
897 |
)
|
898 |
max_sim.cache_clear()
|
899 |
|
@@ -907,13 +907,13 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
907 |
return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
|
908 |
|
909 |
prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
|
910 |
-
prediction_df['X1'].
|
911 |
)
|
912 |
max_sim.cache_clear()
|
913 |
|
914 |
if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
|
915 |
x2 = prediction_df['X2'].iloc[0]
|
916 |
-
prediction_df['X1^'] = prediction_df['X1'].
|
917 |
|
918 |
@cache
|
919 |
def calculate_max_sequence_identity(compound):
|
@@ -921,7 +921,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
921 |
return max_sequence_identity(x2, seen_fastas=compound_targets)
|
922 |
|
923 |
prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
|
924 |
-
prediction_df['X1^'].
|
925 |
)
|
926 |
prediction_df.drop(['X1^'], axis=1, inplace=True)
|
927 |
|
@@ -937,7 +937,7 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
937 |
|
938 |
prediction_df.loc[
|
939 |
prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
|
940 |
-
prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].
|
941 |
)
|
942 |
max_id.cache_clear()
|
943 |
|
@@ -991,10 +991,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
|
991 |
|
992 |
if 'X1' in df.columns:
|
993 |
if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
|
994 |
-
df['Compound'] = df['X1'].
|
995 |
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
996 |
-
df['Scaffold'] = df['Compound'].
|
997 |
-
df['Scaffold SMILES'] = df['Scaffold'].
|
998 |
|
999 |
if task == 'Compound-Protein Binding Affinity':
|
1000 |
# Convert Y^ from pIC50 to IC50
|
@@ -1040,13 +1040,13 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1040 |
columns_unique = None
|
1041 |
|
1042 |
if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
|
1043 |
-
df_html['Compound'] = df_html['Compound'].
|
1044 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
1045 |
else:
|
1046 |
df_html.drop(['Compound'], axis=1, inplace=True)
|
1047 |
|
1048 |
if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
|
1049 |
-
df_html['Scaffold'] = df_html['Scaffold'].
|
1050 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
1051 |
else:
|
1052 |
df_html.drop(['Scaffold'], axis=1, inplace=True)
|
@@ -1076,7 +1076,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1076 |
df_html.rename(columns=column_aliases, inplace=True)
|
1077 |
df_html.index.name = 'Index'
|
1078 |
if 'Target FASTA' in df_html.columns:
|
1079 |
-
df_html['Target FASTA'] = df_html['Target FASTA'].
|
1080 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
1081 |
|
1082 |
num_cols = df_html.select_dtypes('number').columns
|
@@ -1094,7 +1094,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1094 |
if 'Target ID' in df_html.columns:
|
1095 |
df_html.drop(['Target FASTA'], axis=1, inplace=True)
|
1096 |
if 'Target FASTA' in df_html.columns:
|
1097 |
-
df_html['Target FASTA'] = df_html['Target FASTA'].
|
1098 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
1099 |
if 'Scaffold SMILES' in df_html.columns:
|
1100 |
df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
|
@@ -1159,7 +1159,9 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
1159 |
|
1160 |
report_table = pn.widgets.Tabulator(
|
1161 |
df_html, formatters=formatters,
|
1162 |
-
frozen_columns=[
|
|
|
|
|
1163 |
disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
|
1164 |
|
1165 |
for i, col in enumerate(num_cols):
|
@@ -1314,7 +1316,7 @@ def create_pie_chart(df, category, value, top_k):
|
|
1314 |
("Percentage", "@proportion{0.0%}")
|
1315 |
]
|
1316 |
|
1317 |
-
if category == 'Scaffold SMILES':
|
1318 |
data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
|
1319 |
left_on='Scaffold SMILES', right_on='Scaffold SMILES')
|
1320 |
tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
|
@@ -1353,11 +1355,11 @@ def submit_report(df, score_list, filter_list, task, progress=gr.Progress(track_
|
|
1353 |
df_report = df.copy()
|
1354 |
try:
|
1355 |
for filter_name in filter_list:
|
1356 |
-
df_report[filter_name] = df_report['Compound'].
|
1357 |
lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
|
1358 |
|
1359 |
for score_name in score_list:
|
1360 |
-
df_report[score_name] = df_report['Compound'].
|
1361 |
lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
|
1362 |
|
1363 |
return (create_html_report(df_report, file=None, task=task), df_report,
|
@@ -1990,7 +1992,7 @@ with gr.Blocks(theme=theme, title='DeepSEQreen', css=CSS, delete_cache=(3600, 48
|
|
1990 |
alignment = aligner.align(processed_fasta, query)
|
1991 |
return alignment.score / max(len(processed_fasta), len(query))
|
1992 |
|
1993 |
-
alignment_df['score'] = alignment_df['X2'].
|
1994 |
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
1995 |
family = str(row['Target Family']).title()
|
1996 |
return gr.Dropdown(value=family,
|
@@ -2316,13 +2318,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2316 |
infer_df = pd.read_csv(drug_target_pair_upload)
|
2317 |
validate_columns(infer_df, ['X1', 'X2'])
|
2318 |
|
2319 |
-
infer_df['X1_ERR'] = infer_df['X1'].
|
2320 |
validate_seq_str, regex=SMILES_PAT)
|
2321 |
if not infer_df['X1_ERR'].isna().all():
|
2322 |
raise ValueError(
|
2323 |
f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
|
2324 |
|
2325 |
-
infer_df['X2_ERR'] = infer_df['X2'].
|
2326 |
validate_seq_str, regex=FASTA_PAT)
|
2327 |
if not infer_df['X2_ERR'].isna().all():
|
2328 |
raise ValueError(
|
@@ -2546,6 +2548,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2546 |
info=f'Found {label} in your uploaded dataset. '
|
2547 |
'Is it compound-protein interaction or binding affinity?'),
|
2548 |
html_report: ''}
|
|
|
|
|
2549 |
|
2550 |
|
2551 |
report_df_change = file_for_report.change(
|
@@ -2562,7 +2566,7 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
2562 |
concurrency_limit=100,
|
2563 |
).success(
|
2564 |
fn=inquire_task, inputs=[raw_df],
|
2565 |
-
outputs=[report_task, html_report
|
2566 |
)
|
2567 |
|
2568 |
file_for_report.clear(
|
|
|
43 |
from apscheduler.schedulers.background import BackgroundScheduler
|
44 |
from tinydb import TinyDB, Query
|
45 |
|
46 |
+
# import swifter
|
47 |
from tqdm.auto import tqdm
|
48 |
|
49 |
from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
|
|
|
786 |
orig_df['Target Family'] = None
|
787 |
if orig_df['Target Family'].isna().any():
|
788 |
orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
|
789 |
+
orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
|
790 |
)
|
791 |
detect_family.cache_clear()
|
792 |
|
|
|
885 |
if "Include Compound Max. Tanimoto Similarity to Training Compounds" in opts:
|
886 |
for family in prediction_df['Target Family'].unique():
|
887 |
family_smiles_df = get_seen_smiles(family=family, task=task_value)
|
888 |
+
family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(smiles_to_ecfp)
|
889 |
|
890 |
@cache
|
891 |
def max_sim(smi):
|
|
|
893 |
|
894 |
prediction_df.loc[
|
895 |
prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity to Training Compounds'] = (
|
896 |
+
prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
|
897 |
)
|
898 |
max_sim.cache_clear()
|
899 |
|
|
|
907 |
return max_tanimoto_similarity(smiles, seen_smiles_with_fp=pos_compounds_df)
|
908 |
|
909 |
prediction_df[['Max. Tanimoto Similarity', 'Max. Tanimoto Similarity Compound']] = (
|
910 |
+
prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
|
911 |
)
|
912 |
max_sim.cache_clear()
|
913 |
|
914 |
if "Include Target Max. Sequence Identity to Known Interacting Targets of Compound" in opts:
|
915 |
x2 = prediction_df['X2'].iloc[0]
|
916 |
+
prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
|
917 |
|
918 |
@cache
|
919 |
def calculate_max_sequence_identity(compound):
|
|
|
921 |
return max_sequence_identity(x2, seen_fastas=compound_targets)
|
922 |
|
923 |
prediction_df[['Max. Sequence Identity', 'Max. Sequence Identity Target']] = (
|
924 |
+
prediction_df['X1^'].parallel_apply(calculate_max_sequence_identity).apply(pd.Series)
|
925 |
)
|
926 |
prediction_df.drop(['X1^'], axis=1, inplace=True)
|
927 |
|
|
|
937 |
|
938 |
prediction_df.loc[
|
939 |
prediction_df['Target Family'] == family, 'Max. Sequence Identity to Training Targets'] = (
|
940 |
+
prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
|
941 |
)
|
942 |
max_id.cache_clear()
|
943 |
|
|
|
991 |
|
992 |
if 'X1' in df.columns:
|
993 |
if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
|
994 |
+
df['Compound'] = df['X1'].parallel_apply(
|
995 |
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
996 |
+
df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
|
997 |
+
df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
|
998 |
|
999 |
if task == 'Compound-Protein Binding Affinity':
|
1000 |
# Convert Y^ from pIC50 to IC50
|
|
|
1040 |
columns_unique = None
|
1041 |
|
1042 |
if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
|
1043 |
+
df_html['Compound'] = df_html['Compound'].parallel_apply(
|
1044 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
1045 |
else:
|
1046 |
df_html.drop(['Compound'], axis=1, inplace=True)
|
1047 |
|
1048 |
if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
|
1049 |
+
df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
|
1050 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
1051 |
else:
|
1052 |
df_html.drop(['Scaffold'], axis=1, inplace=True)
|
|
|
1076 |
df_html.rename(columns=column_aliases, inplace=True)
|
1077 |
df_html.index.name = 'Index'
|
1078 |
if 'Target FASTA' in df_html.columns:
|
1079 |
+
df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
|
1080 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
1081 |
|
1082 |
num_cols = df_html.select_dtypes('number').columns
|
|
|
1094 |
if 'Target ID' in df_html.columns:
|
1095 |
df_html.drop(['Target FASTA'], axis=1, inplace=True)
|
1096 |
if 'Target FASTA' in df_html.columns:
|
1097 |
+
df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
|
1098 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
1099 |
if 'Scaffold SMILES' in df_html.columns:
|
1100 |
df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
|
|
|
1159 |
|
1160 |
report_table = pn.widgets.Tabulator(
|
1161 |
df_html, formatters=formatters,
|
1162 |
+
frozen_columns=[col for col in df_html.columns if col in [
|
1163 |
+
'Index', 'Target ID', 'Compound ID', 'Compound', 'Scaffold'
|
1164 |
+
]],
|
1165 |
disabled=True, sizing_mode='stretch_both', pagination='local', page_size=30)
|
1166 |
|
1167 |
for i, col in enumerate(num_cols):
|
|
|
1316 |
("Percentage", "@proportion{0.0%}")
|
1317 |
]
|
1318 |
|
1319 |
+
if category == 'Scaffold SMILES' and 'Scaffold' in df.columns:
|
1320 |
data = data.merge(top_k_df[['Scaffold SMILES', 'Scaffold']].drop_duplicates(), how='left',
|
1321 |
left_on='Scaffold SMILES', right_on='Scaffold SMILES')
|
1322 |
tooltips.append(("Scaffold", "<div>@{Scaffold}{safe}</div>"))
|
|
|
1355 |
df_report = df.copy()
|
1356 |
try:
|
1357 |
for filter_name in filter_list:
|
1358 |
+
df_report[filter_name] = df_report['Compound'].parallel_apply(
|
1359 |
lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
|
1360 |
|
1361 |
for score_name in score_list:
|
1362 |
+
df_report[score_name] = df_report['Compound'].parallel_apply(
|
1363 |
lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
|
1364 |
|
1365 |
return (create_html_report(df_report, file=None, task=task), df_report,
|
|
|
1992 |
alignment = aligner.align(processed_fasta, query)
|
1993 |
return alignment.score / max(len(processed_fasta), len(query))
|
1994 |
|
1995 |
+
alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
|
1996 |
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
1997 |
family = str(row['Target Family']).title()
|
1998 |
return gr.Dropdown(value=family,
|
|
|
2318 |
infer_df = pd.read_csv(drug_target_pair_upload)
|
2319 |
validate_columns(infer_df, ['X1', 'X2'])
|
2320 |
|
2321 |
+
infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
|
2322 |
validate_seq_str, regex=SMILES_PAT)
|
2323 |
if not infer_df['X1_ERR'].isna().all():
|
2324 |
raise ValueError(
|
2325 |
f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
|
2326 |
|
2327 |
+
infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
|
2328 |
validate_seq_str, regex=FASTA_PAT)
|
2329 |
if not infer_df['X2_ERR'].isna().all():
|
2330 |
raise ValueError(
|
|
|
2548 |
info=f'Found {label} in your uploaded dataset. '
|
2549 |
'Is it compound-protein interaction or binding affinity?'),
|
2550 |
html_report: ''}
|
2551 |
+
else:
|
2552 |
+
return {report_task: gr.Dropdown(visible=False)}
|
2553 |
|
2554 |
|
2555 |
report_df_change = file_for_report.change(
|
|
|
2566 |
concurrency_limit=100,
|
2567 |
).success(
|
2568 |
fn=inquire_task, inputs=[raw_df],
|
2569 |
+
outputs=[report_task, html_report],
|
2570 |
)
|
2571 |
|
2572 |
file_for_report.clear(
|