Update app.py
app.py
CHANGED
@@ -6,7 +6,7 @@ import textwrap
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.utils import formatdate, make_msgid
-from functools import cache
+from functools import cache, partial
 from math import pi
 from time import sleep, time
 from uuid import uuid4
@@ -25,6 +25,7 @@ import hydra
 import pandas as pd
 from pandarallel import pandarallel
 import requests
+from rdkit.DataStructs import BulkTanimotoSimilarity
 from requests.adapters import HTTPAdapter, Retry
 from markdown import markdown
 from rdkit import Chem, DataStructs
@@ -291,29 +292,24 @@ def check_expiry():
             send_email(job)


-
-def max_tanimoto_similarity(smi, seen_smiles):
+def max_tanimoto_similarity(smi, seen_smiles_with_fp):
     if smi is None:
         return 0
+    if smi in seen_smiles_with_fp['X1'].values:
+        return 1
     mol = Chem.MolFromSmiles(smi)
     if mol is None:
         return 0
     mol_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
-    max_sim = 0
-    for smiles in seen_smiles:
-        mol_seen = Chem.MolFromSmiles(smiles)
-        mol_seen_ecfp = AllChem.GetMorganFingerprintAsBitVect(mol_seen, radius=2, nBits=2048)
-        sim = DataStructs.TanimotoSimilarity(mol_ecfp, mol_seen_ecfp)
-        if sim == 1:
-            return 1
-        max_sim = max(sim, max_sim)
-    return max_sim
+    sims = BulkTanimotoSimilarity(mol_ecfp, seen_smiles_with_fp['FP'])
+    return max(sims)


-@cache
 def max_sequence_identity(seq, seen_fastas):
     if seq is None:
         return 0
+    if seq in seen_fastas:
+        return 1
     aligner = PairwiseAligner()
     aligner.mode = 'local'
     max_id = 0
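Note on the hunk above: instead of re-parsing and re-fingerprinting every seen compound per query, the new max_tanimoto_similarity expects the seen compounds as a DataFrame with an 'X1' SMILES column and a precomputed 'FP' fingerprint column, returns 1 immediately for exact SMILES matches, and hands the comparison to RDKit's BulkTanimotoSimilarity. A minimal, self-contained sketch of that pattern follows; the SMILES strings are illustrative, not from the app's data.

    # Sketch of the BulkTanimotoSimilarity pattern; example SMILES are made up.
    import pandas as pd
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit.DataStructs import BulkTanimotoSimilarity

    seen = pd.DataFrame({'X1': ['CCO', 'c1ccccc1', 'CC(=O)O']})
    # Fingerprint each seen compound once (app.py does this once per target family).
    seen['FP'] = seen['X1'].apply(
        lambda s: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), radius=2, nBits=2048))

    query_fp = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles('CCN'), radius=2, nBits=2048)
    # One bulk call replaces the old Python loop over DataStructs.TanimotoSimilarity.
    sims = BulkTanimotoSimilarity(query_fp, list(seen['FP']))
    print(max(sims))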
@@ -328,16 +324,24 @@ def max_sequence_identity(seq, seen_fastas):

 @cache
 def get_seen_smiles(family, task):
+    if family == 'General':
+        family = 'all_families_full'
+    else:
+        family = TARGET_FAMILY_MAP[family.title()]
     seen_smiles = pd.read_csv(
-        f'data/benchmarks/seen_compounds/{
-    return seen_smiles
+        f'data/benchmarks/seen_compounds/{family}_{task.lower()}_random_split.csv')
+    return seen_smiles


 @cache
 def get_seen_fastas(family, task):
+    if family == 'General':
+        family = 'all_families_full'
+    else:
+        family = TARGET_FAMILY_MAP[family.title()]
     seen_fastas = pd.read_csv(
-        f'data/benchmarks/seen_targets/{
-    return seen_fastas
+        f'data/benchmarks/seen_targets/{family}_{task.lower()}_random_split.csv')
+    return seen_fastas


 @cache
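Because both loaders are wrapped in functools.cache, each (family, task) pair reads its CSV only once per process; the added branch just normalises the family name to the benchmark file naming scheme before the path is built. A small sketch of that memoisation behaviour, where the mapping is a one-entry stand-in, the task string is illustrative, and a counter stands in for pd.read_csv:

    from functools import cache

    TARGET_FAMILY_MAP = {'Kinase': 'kinase'}   # illustrative subset of the app's mapping
    loads = []

    @cache
    def get_seen_smiles(family, task):
        family = 'all_families_full' if family == 'General' else TARGET_FAMILY_MAP[family.title()]
        loads.append((family, task))           # stands in for the pd.read_csv call
        return f'data/benchmarks/seen_compounds/{family}_{task.lower()}_random_split.csv'

    get_seen_smiles('General', 'DTI')
    get_seen_smiles('General', 'DTI')          # second call is served from the cache
    assert len(loads) == 1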
@@ -709,7 +713,6 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
     error = None
     task_file_abbr = {'Compound-Protein Interaction': 'CPI', 'Compound-Protein Binding Affinity': 'CPA'}
     predictions_file = None
-
     df_training = pd.read_csv(f'data/complete_{TASK_MAP[task].lower()}_dataset.csv')
     orig_df = pd.read_csv(predict_filepath)
     alignment_df = get_fasta_family_map()
@@ -737,12 +740,9 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
     if 'Target Family' not in orig_df.columns:
         orig_df['Target Family'] = None
     if orig_df['Target Family'].isna().any():
-        orig_df.loc[
-            orig_df['Target Family'].isna(), 'Target Family'
-        ] = orig_df.loc[
-            orig_df['Target Family'].isna(), 'X2'
-        ].parallel_apply(detect_family)
-
+        orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
+            orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
+        )
     detect_family.cache_clear()

     orig_df = orig_df.merge(df_training[['X1', 'X2', 'Y']], on=['X1', 'X2'], how='left', indicator=False)
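The back-fill above now writes the detected families through a single boolean-mask .loc assignment, with pandarallel's parallel_apply doing the per-sequence classification, and clears the detect_family cache afterwards so one upload's results don't leak into the next. A sketch of the mask-assign pattern using plain .apply (parallel_apply has the same call shape once pandarallel.initialize() has run); detect_family here is a stub:

    import pandas as pd

    def detect_family(fasta):
        return 'kinase'                      # stand-in for the real (cached) classifier

    orig_df = pd.DataFrame({'X2': ['SEQA', 'SEQB', 'SEQC'],
                            'Target Family': ['kinase', None, None]})

    mask = orig_df['Target Family'].isna()
    # Only rows still missing a family are classified and written back in place.
    orig_df.loc[mask, 'Target Family'] = orig_df.loc[mask, 'X2'].apply(detect_family)
    print(orig_df)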
@@ -783,76 +783,82 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, state):
            prediction_df = pd.concat([prediction_df, predictions])

        else:
-            predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}
+            predictions_file = f'{SERVER_DATA_DIR}/{job_id}_{task_file_abbr[task]}_family-recommended_predictions.csv'
            task_value = TASK_MAP[task]
            score = TASK_METRIC_MAP[task]
            benchmark_df = pd.read_csv(f'data/benchmarks/{task_value}_test_metrics.csv')
            predict_df = pd.read_csv(predict_filepath)

            for family, subset in predict_df.groupby('Target Family'):
-                predict_subset_filepath =
+                predict_subset_filepath = os.path.join(
+                    os.path.dirname(predict_filepath), f'{job_id}_{family}_input.csv'
+                )
                subset.to_csv(predict_subset_filepath, index=False, na_rep='')
-                seen_compounds = get_seen_smiles(family, task_value)

+                seen_compounds = get_seen_smiles(family, task_value)['X1'].values
                if subset['X1'].iloc[0] in seen_compounds:
                    scenario = "Seen Compound"
                else:
                    scenario = "Unseen Compound"

                filtered_df = benchmark_df[(benchmark_df['Family'] == family.title())
-                                           & (benchmark_df['Scenario'] == scenario)
+                                           & (benchmark_df['Scenario'] == scenario)
+                                           & (benchmark_df['Type'] == 'Family')]

-
-
+                seen_compounds = get_seen_smiles('General', task_value)['X1'].values
+                if subset['X1'].iloc[0] in seen_compounds:
+                    scenario = "Seen Compound"
+                else:
+                    scenario = "Unseen Compound"
+
+                filtered_df = pd.concat([
+                    filtered_df,
+                    benchmark_df[(benchmark_df['Family'] == family.title())
+                                 & (benchmark_df['Scenario'] == scenario)
+                                 & (benchmark_df['Type'] == 'General')]
+                ])

-
+                row = filtered_df.loc[filtered_df[score].idxmax()]
+                preset_value = PRESET_MAP[row['Model']]
+                target_family = TARGET_FAMILY_MAP[family.title()] if row['Type'] == 'Family' else 'general'
                cfg = hydra.compose(
                    config_name="webserver_inference",
                    overrides=[f"task={task_value}",
                               f"preset={preset_value}",
-                              # f"ckpt_path=D:/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
                               f"ckpt_path=resources/checkpoints/{preset_value}-{task_value}-{target_family}.ckpt",
                               f"data.data_file='{str(predict_subset_filepath)}'"])

                predictions, _ = predict(cfg)
                predictions = pd.concat([pd.DataFrame(prediction) for prediction in predictions], ignore_index=True)
-                predictions['Source'] = f'Predicted ({
+                predictions['Source'] = (f'Predicted ({row["Model"]} '
+                                         f'{family.title() if row["Type"] == "Family" else "General"})')
                prediction_df = pd.concat([prediction_df, predictions])

        prediction_df = prediction_df.merge(orig_df, on=['X1', 'X2'], how='left', indicator=False)
        prediction_df = pd.concat([prediction_df, annotated_df], ignore_index=True)

-        # prediction_df['Max. Tanimoto Similarity'] = prediction_df.groupby('Target Family')['X1'].apply(
-        #     lambda group: group.parallel_apply(
-        #         max_tanimoto_similarity,
-        #         seen_smiles=tuple(get_seen_smiles(family=group.name, task=task_value))
-        #     )
-        # ).values
-        #
-        # prediction_df['Max. Sequence Identity'] = prediction_df.groupby('Target Family')['X2'].apply(
-        #     lambda group: group.parallel_apply(
-        #         max_sequence_identity,
-        #         seen_fastas=tuple(get_seen_fastas(family=group.name, task=task_value))
-        #     )
-        # ).values
        if "Include Max. Tanimoto Similarity" in opts:
            for family in prediction_df['Target Family'].unique():
-                prediction_df.loc[
-                    prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'
-                ] = prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(
-                    max_tanimoto_similarity,
-                    seen_smiles=tuple(get_seen_smiles(family=family, task=task_value))
+                family_smiles_df = get_seen_smiles(family=family, task=task_value)
+                family_smiles_df['FP'] = family_smiles_df['X1'].parallel_apply(
+                    lambda smiles: AllChem.GetMorganFingerprintAsBitVect(
+                        Chem.MolFromSmiles(smiles), radius=2, nBits=2048)
+                )
+                max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=family_smiles_df))
+                prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Tanimoto Similarity'] = (
+                    prediction_df.loc[prediction_df['Target Family'] == family, 'X1'].parallel_apply(max_sim)
                )
-
+                max_sim.cache_clear()
+
        if "Include Max. Sequence Identity" in opts:
            for family in prediction_df['Target Family'].unique():
-                prediction_df.loc[
-                    prediction_df['Target Family'] == family, 'Max. Sequence Identity'
-                ] = prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(
-                    max_sequence_identity,
-                    seen_fastas=tuple(get_seen_fastas(family=family, task=task_value))
+                family_fastas_df = get_seen_fastas(family=family, task=task_value)
+                max_id = cache(partial(max_sequence_identity, seen_fastas=family_fastas_df['X2'].values))
+                prediction_df.loc[prediction_df['Target Family'] == family, 'Max. Sequence Identity'] = (
+                    prediction_df.loc[prediction_df['Target Family'] == family, 'X2'].parallel_apply(max_id)
                )
-
+                max_id.cache_clear()
+
        prediction_df.drop(['N'], axis=1).to_csv(predictions_file, index=False, na_rep='')
        status = "COMPLETED"

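The most load-bearing change in this hunk is the cache(partial(...)) idiom: the unhashable per-family DataFrame (or FASTA array) is bound into a partial, so the memo key is just the hashable query string, and cache_clear() resets the memo before the next family. A stripped-down sketch of that idiom; the scoring function body and the data object are placeholders:

    from functools import cache, partial

    def max_tanimoto_similarity(smi, seen_smiles_with_fp):
        print(f'computed {smi}')             # visible only on cache misses
        return 0.42                          # placeholder for the real RDKit work

    family_smiles_df = {'placeholder': True} # stands in for the per-family DataFrame

    max_sim = cache(partial(max_tanimoto_similarity, seen_smiles_with_fp=family_smiles_df))
    max_sim('CCO')                           # computed
    max_sim('CCO')                           # cache hit, no second print
    max_sim.cache_clear()                    # drop the memo before the next family

Worth noting: pandarallel dispatches parallel_apply to worker processes, so the memo is not shared across workers; it mainly saves work when the same SMILES or FASTA value repeats within a worker's chunk.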
@@ -1968,9 +1974,8 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
        return [None, family]

    if family == 'General':
-        seen_targets =
-
-        if process_target_fasta(fasta) in seen_targets['X2'].values:
+        seen_targets = get_seen_fastas('General', task)['X2'].values
+        if process_target_fasta(fasta) in seen_targets:
            scenario = "Seen Target"
        else:
            scenario = "Unseen Target"
@@ -1979,16 +1984,14 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
                                    & (benchmark_df['Type'] == 'General')]

    else:
-        seen_targets_general =
-
-        if process_target_fasta(fasta) in seen_targets_general['X2'].values:
+        seen_targets_general = get_seen_fastas('General', task)['X2'].values
+        if process_target_fasta(fasta) in seen_targets_general:
            scenario_general = "Seen Target"
        else:
            scenario_general = "Unseen Target"

-        seen_targets_family =
-
-        if process_target_fasta(fasta) in seen_targets_family['X2'].values:
+        seen_targets_family = get_seen_fastas(family, task)['X2'].values
+        if process_target_fasta(fasta) in seen_targets_family:
            scenario_family = "Seen Target"
        else:
            scenario_family = "Unseen Target"
@@ -2008,10 +2011,9 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
            scenario = "Unseen Target (<0.85 sequence identity)"

    return {drug_screen_preset:
-
-
-
-                f"on {row['Family']}."),
+                gr.Dropdown(value=row['Model'],
+                            info=f"Reason: {row['Scenario']} in training; we recommend the {row['Type']}-trained "
+                                 f"model with the best {score} in the {scenario} scenario on {row['Family']}."),
            drug_screen_target_family:
                gr.Dropdown(value='General') if row['Type'] == 'General' else gr.Dropdown(value=family)}

@@ -2569,4 +2571,4 @@ if __name__ == "__main__":
    hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference")
    demo.queue(default_concurrency_limit=None, max_size=10).launch(show_api=False)
    scheduler.add_job(check_expiry, 'interval', hours=1)
-    scheduler.start()
+    scheduler.start()