Spaces:

ChemFM
/

molecular_property_prediction

Sleeping

App Files Files Community

feiyang-cai committed on Oct 21, 2024

Commit

d84b0a6

1 Parent(s): 8b9fe11

revise the descriptions

Browse files

Files changed (3) hide show

app.py +20 -12
dataset_descriptions.json +44 -0
utils.py +21 -9

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 from huggingface_hub import HfApi, get_collection, list_collections
-from utils import MolecularPropertyPredictionModel, task_types, dataset_descriptions
 import pandas as pd
 import os
@@ -12,22 +12,26 @@ def get_models():
         if item.item_type == "model":
             item_name = item.item_id.split("/")[-1]
             models[item_name] = item.item_id
-            assert item_name in task_types, f"{item_name} is not in the task_types"
             assert item_name in dataset_descriptions, f"{item_name} is not in the dataset_descriptions"
     return models
 candidate_models = get_models()
-properties = list(candidate_models.keys())
 model = MolecularPropertyPredictionModel(candidate_models)
 def get_description(property_name):
-    return dataset_descriptions[property_name]
 def predict_single_label(smiles, property_name):
     try:
-        adapter_id = candidate_models[property_name]
-        info = model.swith_adapter(property_name, adapter_id)
         running_status = None
         if info == "keep":
@@ -45,7 +49,8 @@ def predict_single_label(smiles, property_name):
             return "NA", running_status
         #prediction = model.predict(smiles, property_name, adapter_id)
-        prediction = model.predict_single_smiles(smiles, task_types[property_name])
         if prediction is None:
             return "NA", "Invalid SMILES string"
@@ -60,9 +65,10 @@ def predict_single_label(smiles, property_name):
     return prediction, "Prediction is done"
 def predict_file(file, property_name):
     try:
-        adapter_id = candidate_models[property_name]
-        info = model.swith_adapter(property_name, adapter_id)
         running_status = None
         if info == "keep":
@@ -81,7 +87,7 @@ def predict_file(file, property_name):
         df = pd.read_csv(file)
         # we have already checked the file contains the "smiles" column
-        df = model.predict_file(df, task_types[property_name])
         # we should save this file to the disk to be downloaded
         # rename the file to have "_prediction" suffix
         prediction_file = file.replace(".csv", "_prediction.csv") if file.endswith(".csv") else file.replace(".smi", "_prediction.csv")
@@ -157,10 +163,12 @@ def build_inference():
     with gr.Blocks() as demo:
         # first row - Dropdown input
         #with gr.Row():
-        dropdown = gr.Dropdown(properties, label="Property", value=properties[0])
         description_box = gr.Textbox(label="Property description", lines=5,
                                      interactive=False,
-                                     value=dataset_descriptions[properties[0]])
         # third row - Textbox input and prediction label
         with gr.Row(equal_height=True):
             with gr.Column():

 import gradio as gr
 from huggingface_hub import HfApi, get_collection, list_collections
+from utils import MolecularPropertyPredictionModel, dataset_task_types, dataset_descriptions, dataset_property_names, dataset_property_names_to_dataset
 import pandas as pd
 import os
         if item.item_type == "model":
             item_name = item.item_id.split("/")[-1]
             models[item_name] = item.item_id
+            assert item_name in dataset_task_types, f"{item_name} is not in the task_types"
             assert item_name in dataset_descriptions, f"{item_name} is not in the dataset_descriptions"
     return models
 candidate_models = get_models()
+properties = [dataset_property_names[item] for item in candidate_models.keys()]
+property_names = list(candidate_models.keys())
 model = MolecularPropertyPredictionModel(candidate_models)
 def get_description(property_name):
+    property_id = dataset_property_names_to_dataset[property_name]
+    return dataset_descriptions[property_id]
 def predict_single_label(smiles, property_name):
+    property_id = dataset_property_names_to_dataset[property_name]
     try:
+        adapter_id = candidate_models[property_id]
+        info = model.swith_adapter(property_id, adapter_id)
         running_status = None
         if info == "keep":
             return "NA", running_status
         #prediction = model.predict(smiles, property_name, adapter_id)
+        print("hello4")
+        prediction = model.predict_single_smiles(smiles, dataset_task_types[property_id])
         if prediction is None:
             return "NA", "Invalid SMILES string"
     return prediction, "Prediction is done"
 def predict_file(file, property_name):
+    property_id = dataset_property_names_to_dataset[property_name]
     try:
+        adapter_id = candidate_models[property_id]
+        info = model.swith_adapter(property_id, adapter_id)
         running_status = None
         if info == "keep":
         df = pd.read_csv(file)
         # we have already checked the file contains the "smiles" column
+        df = model.predict_file(df, dataset_task_types[property_id])
         # we should save this file to the disk to be downloaded
         # rename the file to have "_prediction" suffix
         prediction_file = file.replace(".csv", "_prediction.csv") if file.endswith(".csv") else file.replace(".smi", "_prediction.csv")
     with gr.Blocks() as demo:
         # first row - Dropdown input
         #with gr.Row():
+        print(property_names[0].lower())
+        print(properties)
+        dropdown = gr.Dropdown(properties, label="Property", value=dataset_property_names[property_names[0].lower()])
         description_box = gr.Textbox(label="Property description", lines=5,
                                      interactive=False,
+                                     value=dataset_descriptions[property_names[0].lower()])
         # third row - Textbox input and prediction label
         with gr.Row(equal_height=True):
             with gr.Column():

dataset_descriptions.json CHANGED Viewed

@@ -1,112 +1,156 @@
 {
     "ADMET_Caco2_Wang": {
         "task_type": "regression",
         "description": "predict drug permeability, measured in cm/s, using the Caco-2 cell line as an in vitro model to simulate human intestinal tissue permeability",
         "num_molecules": 906
     },
     "ADMET_Bioavailability_Ma": {
         "task_type": "classification",
         "description": "predict oral bioavailability with binary labels, indicating the rate and extent a drug becomes available at its site of action",
         "num_molecules": 640
     },
     "ADMET_Lipophilicity_AstraZeneca": {
         "task_type": "regression",
         "description": "predict lipophilicity with continuous labels, measured as a log-ratio, indicating a drug's ability to dissolve in lipid environments",
         "num_molecules": 4200
     },
     "ADMET_Solubility_AqSolDB": {
         "task_type": "regression",
         "description": "predict aqueous solubility with continuous labels, measured in log mol/L, indicating a drug's ability to dissolve in water",
         "num_molecules": 9982
     },
     "ADMET_HIA_Hou": {
         "task_type": "classification",
         "description": "predict human intestinal absorption (HIA) with binary labels, indicating a drug's ability to be absorbed into the bloodstream",
         "num_molecules": 578
     },
     "ADMET_Pgp_Broccatelli": {
         "task_type": "classification",
         "description": "predict P-glycoprotein (Pgp) inhibition with binary labels, indicating a drug's potential to alter bioavailability and overcome multidrug resistance",
         "num_molecules": 1212
     },
     "ADMET_BBB_Martins": {
         "task_type": "classification",
         "description": "predict blood-brain barrier permeability with binary labels, indicating a drug's ability to penetrate the barrier to reach the brain",
         "num_molecules": 1915
     },
     "ADMET_PPBR_AZ": {
         "task_type": "regression",
         "description": "predict plasma protein binding rate with continuous labels, indicating the percentage of a drug bound to plasma proteins in the blood",
         "num_molecules": 1797
     },
     "ADMET_VDss_Lombardo": {
         "task_type": "regression",
         "description": "predict the volume of distribution at steady state (VDss), indicating drug concentration in tissues versus blood",
         "num_molecules": 1130
     },
     "ADMET_CYP2C9_Veith": {
         "task_type": "classification",
         "description": "predict CYP2C9 inhibition with binary labels, indicating the drug's ability to inhibit the CYP2C9 enzyme involved in metabolism",
         "num_molecules": 12092
     },
     "ADMET_CYP2D6_Veith": {
         "task_type": "classification",
         "description": "predict CYP2D6 inhibition with binary labels, indicating the drug's potential to inhibit the CYP2D6 enzyme involved in metabolism",
         "num_molecules": 13130
     },
     "ADMET_CYP3A4_Veith": {
         "task_type": "classification",
         "description": "predict CYP3A4 inhibition with binary labels, indicating the drug's ability to inhibit the CYP3A4 enzyme involved in metabolism",
         "num_molecules": 12328
     },
     "ADMET_CYP2C9_Substrate_CarbonMangels": {
         "task_type": "classification",
         "description": "predict whether a drug is a substrate of the CYP2C9 enzyme with binary labels, indicating its potential to be metabolized",
         "num_molecules": 666
     },
     "ADMET_CYP2D6_Substrate_CarbonMangels": {
         "task_type": "classification",
         "description": "predict whether a drug is a substrate of the CYP2D6 enzyme with binary labels, indicating its potential to be metabolized",
         "num_molecules": 664
     },
     "ADMET_CYP3A4_Substrate_CarbonMangels": {
         "task_type": "classification",
         "description": "predict whether a drug is a substrate of the CYP3A4 enzyme with binary labels, indicating its potential to be metabolized",
         "num_molecules": 667
     },
     "ADMET_Half_Life_Obach": {
         "task_type": "regression",
         "description": "predict the half-life duration of a drug, measured in hours, indicating the time for its concentration to reduce by half",
         "num_molecules": 667
     },
     "ADMET_Clearance_Hepatocyte_AZ": {
         "task_type": "regression",
         "description": "predict drug clearance, measured in \u03bcL/min/10^6 cells, from hepatocyte experiments, indicating the rate at which the drug is removed from the body",
         "num_molecules": 1020
     },
     "ADMET_Clearance_Microsome_AZ": {
         "task_type": "regression",
         "description": "predict drug clearance, measured in mL/min/g, from microsome experiments, indicating the rate at which the drug is removed from the body",
         "num_molecules": 1102
     },
     "ADMET_LD50_Zhu": {
         "task_type": "regression",
         "description": "predict the acute toxicity of a drug, measured as the dose leading to lethal effects in log(kg/mol)",
         "num_molecules": 7385
     },
     "ADMET_hERG": {
         "task_type": "classification",
         "description": "predict whether a drug blocks the hERG channel, which is crucial for heart rhythm, potentially leading to adverse effects",
         "num_molecules": 648
     },
     "ADMET_AMES": {
         "task_type": "classification",
         "description": "predict whether a drug is mutagenic with binary labels, indicating its ability to induce genetic alterations",
         "num_molecules": 7255
     },
     "ADMET_DILI": {
         "task_type": "classification",
         "description": "predict whether a drug can cause liver injury with binary labels, indicating its potential for hepatotoxicity",
         "num_molecules": 475
     }
 }

 {
     "ADMET_Caco2_Wang": {
         "task_type": "regression",
+        "task_name": "Drug Permeability",
         "description": "predict drug permeability, measured in cm/s, using the Caco-2 cell line as an in vitro model to simulate human intestinal tissue permeability",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#caco-2-cell-effective-permeability-wang-et-al",
         "num_molecules": 906
     },
     "ADMET_Bioavailability_Ma": {
         "task_type": "classification",
+        "task_name": "Drug Oral Bioavailability",
         "description": "predict oral bioavailability with binary labels, indicating the rate and extent a drug becomes available at its site of action",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#bioavailability-ma-et-al",
         "num_molecules": 640
     },
     "ADMET_Lipophilicity_AstraZeneca": {
         "task_type": "regression",
+        "task_name": "Drug Lipophilicity",
         "description": "predict lipophilicity with continuous labels, measured as a log-ratio, indicating a drug's ability to dissolve in lipid environments",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#lipophilicity-astrazeneca",
         "num_molecules": 4200
     },
     "ADMET_Solubility_AqSolDB": {
         "task_type": "regression",
+        "task_name": "Drug Aqueous Solubility",
         "description": "predict aqueous solubility with continuous labels, measured in log mol/L, indicating a drug's ability to dissolve in water",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#solubility-aqsoldb",
         "num_molecules": 9982
     },
     "ADMET_HIA_Hou": {
         "task_type": "classification",
+        "task_name": "Drug Human Intestinal Absorption",
         "description": "predict human intestinal absorption (HIA) with binary labels, indicating a drug's ability to be absorbed into the bloodstream",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#hia-human-intestinal-absorption-hou-et-al",
         "num_molecules": 578
     },
     "ADMET_Pgp_Broccatelli": {
         "task_type": "classification",
+        "task_name": "P-glycoprotein Inhibition",
         "description": "predict P-glycoprotein (Pgp) inhibition with binary labels, indicating a drug's potential to alter bioavailability and overcome multidrug resistance",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#pgp-p-glycoprotein-inhibition-broccatelli-et-al",
         "num_molecules": 1212
     },
     "ADMET_BBB_Martins": {
         "task_type": "classification",
+        "task_name": "Blood-Brain Barrier Permeability",
         "description": "predict blood-brain barrier permeability with binary labels, indicating a drug's ability to penetrate the barrier to reach the brain",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#bbb-blood-brain-barrier-martins-et-al",
         "num_molecules": 1915
     },
     "ADMET_PPBR_AZ": {
         "task_type": "regression",
+        "task_name": "Plasma Protein Binding Rate",
         "description": "predict plasma protein binding rate with continuous labels, indicating the percentage of a drug bound to plasma proteins in the blood",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#ppbr-plasma-protein-binding-rate-astrazeneca",
         "num_molecules": 1797
     },
     "ADMET_VDss_Lombardo": {
         "task_type": "regression",
+        "task_name": "Volume of Distribution at Steady State",
         "description": "predict the volume of distribution at steady state (VDss), indicating drug concentration in tissues versus blood",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#vdss-volumn-of-distribution-at-steady-state-lombardo-et-al",
         "num_molecules": 1130
     },
     "ADMET_CYP2C9_Veith": {
         "task_type": "classification",
+        "task_name": "CYP2C9 Inhibition",
         "description": "predict CYP2C9 inhibition with binary labels, indicating the drug's ability to inhibit the CYP2C9 enzyme involved in metabolism",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#cyp-p450-2c9-inhibition-veith-et-al",
         "num_molecules": 12092
     },
     "ADMET_CYP2D6_Veith": {
         "task_type": "classification",
+        "task_name": "CYP2D6 Inhibition",
         "description": "predict CYP2D6 inhibition with binary labels, indicating the drug's potential to inhibit the CYP2D6 enzyme involved in metabolism",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#cyp-p450-2d6-inhibition-veith-et-al",
         "num_molecules": 13130
     },
     "ADMET_CYP3A4_Veith": {
         "task_type": "classification",
+        "task_name": "CYP3A4 Inhibition",
         "description": "predict CYP3A4 inhibition with binary labels, indicating the drug's ability to inhibit the CYP3A4 enzyme involved in metabolism",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#cyp-p450-3a4-inhibition-veith-et-al",
         "num_molecules": 12328
     },
     "ADMET_CYP2C9_Substrate_CarbonMangels": {
         "task_type": "classification",
+        "task_name": "CYP2C9 Substrate",
         "description": "predict whether a drug is a substrate of the CYP2C9 enzyme with binary labels, indicating its potential to be metabolized",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#cyp2c9-substrate-carbon-mangels-et-al",
         "num_molecules": 666
     },
     "ADMET_CYP2D6_Substrate_CarbonMangels": {
         "task_type": "classification",
+        "task_name": "CYP2D6 Substrate",
         "description": "predict whether a drug is a substrate of the CYP2D6 enzyme with binary labels, indicating its potential to be metabolized",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#cyp2d6-substrate-carbon-mangels-et-al",
         "num_molecules": 664
     },
     "ADMET_CYP3A4_Substrate_CarbonMangels": {
         "task_type": "classification",
+        "task_name": "CYP3A4 Substrate",
         "description": "predict whether a drug is a substrate of the CYP3A4 enzyme with binary labels, indicating its potential to be metabolized",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#cyp3a4-substrate-carbon-mangels-et-al",
         "num_molecules": 667
     },
     "ADMET_Half_Life_Obach": {
         "task_type": "regression",
+        "task_name": "Drug Half-Life Duration",
         "description": "predict the half-life duration of a drug, measured in hours, indicating the time for its concentration to reduce by half",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#half-life-obach-et-al",
         "num_molecules": 667
     },
     "ADMET_Clearance_Hepatocyte_AZ": {
         "task_type": "regression",
+        "task_name": "Drug Clearance from Hepatocyte Experiments",
         "description": "predict drug clearance, measured in \u03bcL/min/10^6 cells, from hepatocyte experiments, indicating the rate at which the drug is removed from the body",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#clearance-astrazeneca",
         "num_molecules": 1020
     },
     "ADMET_Clearance_Microsome_AZ": {
         "task_type": "regression",
+        "task_name": "Drug Clearance from Microsome Experiments",
         "description": "predict drug clearance, measured in mL/min/g, from microsome experiments, indicating the rate at which the drug is removed from the body",
+        "url": "https://tdcommons.ai/single_pred_tasks/adme#clearance-astrazeneca",
         "num_molecules": 1102
     },
     "ADMET_LD50_Zhu": {
         "task_type": "regression",
+        "task_name": "Drug Acute Toxicity",
         "description": "predict the acute toxicity of a drug, measured as the dose leading to lethal effects in log(kg/mol)",
+        "url": "https://tdcommons.ai/single_pred_tasks/tox#acute-toxicity-ld50",
         "num_molecules": 7385
     },
     "ADMET_hERG": {
         "task_type": "classification",
+        "task_name": "hERG Channel Blockage",
         "description": "predict whether a drug blocks the hERG channel, which is crucial for heart rhythm, potentially leading to adverse effects",
+        "url": "https://tdcommons.ai/single_pred_tasks/tox#herg-blockers",
         "num_molecules": 648
     },
     "ADMET_AMES": {
         "task_type": "classification",
+        "task_name": "Drug Mutagenicity",
         "description": "predict whether a drug is mutagenic with binary labels, indicating its ability to induce genetic alterations",
+        "url": "https://tdcommons.ai/single_pred_tasks/tox#ames-mutagenicity",
         "num_molecules": 7255
     },
     "ADMET_DILI": {
         "task_type": "classification",
+        "task_name": "Drug-Induced Liver Injury",
         "description": "predict whether a drug can cause liver injury with binary labels, indicating its potential for hepatotoxicity",
+        "url": "https://tdcommons.ai/single_pred_tasks/tox#dili-drug-induced-liver-injury",
         "num_molecules": 475
     }
 }

utils.py CHANGED Viewed

@@ -39,22 +39,30 @@ from rdkit import RDLogger, Chem
 RDLogger.DisableLog('rdApp.*')
 # we have a dictionary to store the task types of the models
-task_types = {
-    "admet_ppbr_az": "regression",
-    "admet_half_life_obach": "regression",
-}
 # read the dataset descriptions
 with open("dataset_descriptions.json", "r") as f:
     dataset_description_temp = json.load(f)
 dataset_descriptions = dict()
 for dataset in dataset_description_temp:
     dataset_name = dataset.lower()
     dataset_descriptions[dataset_name] = \
-        f"{dataset_name} is a {dataset_description_temp[dataset]['task_type']} task, " + \
-        f"where the goal is to {dataset_description_temp[dataset]['description']}."
 class Scaler:
     def __init__(self, log=False):
@@ -215,7 +223,11 @@ class MolecularPropertyPredictionModel():
             adapter_id = candidate_models[adapter_name]
             print(f"loading {adapter_name} from {adapter_id}...")
             self.base_model.load_adapter(adapter_id, adapter_name=adapter_name, token = os.environ.get("TOKEN"))
-            self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
         #self.base_model.to("cuda")
         #print(self.base_model)
@@ -242,7 +254,7 @@ class MolecularPropertyPredictionModel():
                 #if adapter_name not in self.apapter_scaler_path:
                 #    self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
-                if os.path.exists(self.apapter_scaler_path[adapter_name]):
                     self.scaler = pickle.load(open(self.apapter_scaler_path[adapter_name], "rb"))
                 else:
                     self.scaler = None
@@ -276,7 +288,7 @@ class MolecularPropertyPredictionModel():
                 if task_type == "regression": # TODO: check if the model is regression or classification
                     y_pred.append(outputs.logits.cpu().detach().numpy())
                 else:
-                    y_pred.append((torch.sigmoid(outputs.logits) > 0.5).cpu().detach().numpy())
             y_pred = np.concatenate(y_pred, axis=0)
             if task_type=="regression" and self.scaler is not None:

 RDLogger.DisableLog('rdApp.*')
 # we have a dictionary to store the task types of the models
+#task_types = {
+#    "admet_bioavailability_ma": "classification",
+#    "admet_ppbr_az": "regression",
+#    "admet_half_life_obach": "regression",
+#}
 # read the dataset descriptions
 with open("dataset_descriptions.json", "r") as f:
     dataset_description_temp = json.load(f)
 dataset_descriptions = dict()
+dataset_property_names = dict()
+dataset_task_types = dict()
+dataset_property_names_to_dataset = dict()
 for dataset in dataset_description_temp:
     dataset_name = dataset.lower()
     dataset_descriptions[dataset_name] = \
+        f"{dataset_description_temp[dataset]['task_name']} is a {dataset_description_temp[dataset]['task_type']} task, " + \
+        f"where the goal is to {dataset_description_temp[dataset]['description']}. \n" + \
+        f"More information can be found at {dataset_description_temp[dataset]['url']}."
+    dataset_property_names[dataset_name] = dataset_description_temp[dataset]['task_name']
+    dataset_property_names_to_dataset[dataset_description_temp[dataset]['task_name']] = dataset_name
+    dataset_task_types[dataset_name] = dataset_description_temp[dataset]['task_type']
 class Scaler:
     def __init__(self, log=False):
             adapter_id = candidate_models[adapter_name]
             print(f"loading {adapter_name} from {adapter_id}...")
             self.base_model.load_adapter(adapter_id, adapter_name=adapter_name, token = os.environ.get("TOKEN"))
+            try:
+                self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+            except:
+                self.apapter_scaler_path[adapter_name] = None
+                assert dataset_task_types[adapter_name] == "classification", f"{adapter_name} is not a regression task."
         #self.base_model.to("cuda")
         #print(self.base_model)
                 #if adapter_name not in self.apapter_scaler_path:
                 #    self.apapter_scaler_path[adapter_name] = hf_hub_download(adapter_id, filename="scaler.pkl", token = os.environ.get("TOKEN"))
+                if self.apapter_scaler_path[adapter_name] and os.path.exists(self.apapter_scaler_path[adapter_name]):
                     self.scaler = pickle.load(open(self.apapter_scaler_path[adapter_name], "rb"))
                 else:
                     self.scaler = None
                 if task_type == "regression": # TODO: check if the model is regression or classification
                     y_pred.append(outputs.logits.cpu().detach().numpy())
                 else:
+                    y_pred.append((torch.sigmoid(outputs.logits)).cpu().detach().numpy())
             y_pred = np.concatenate(y_pred, axis=0)
             if task_type=="regression" and self.scaler is not None: