protein_properties

Running

App Files Files Community

jannisborn committed on Jan 24, 2023

Commit

480220c

0 Parent(s):

Duplicate from GT4SD/molecular_properties

Browse files

Files changed (10) hide show

.gitattributes +34 -0
.gitignore +1 -0
LICENSE +21 -0
README.md +16 -0
app.py +99 -0
model_cards/article.md +68 -0
model_cards/description.md +7 -0
model_cards/examples.smi +13 -0
requirements.txt +29 -0
utils.py +57 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,16 @@

+---
+title: Molecular properties
+emoji: 💡
+colorFrom: green
+colorTo: blue
+sdk: gradio
+sdk_version: 3.9.1
+app_file: app.py
+pinned: false
+python_version: 3.8.13
+pypi_version: 20.2.4
+duplicated_from: GT4SD/molecular_properties
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import logging
+import pathlib
+import gradio as gr
+import numpy as np
+import pandas as pd
+from gt4sd.properties.molecules import MOLECULE_PROPERTY_PREDICTOR_FACTORY
+from utils import draw_grid_predict
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+# Keys of MOLECULE_PROPERTY_PREDICTOR_FACTORY that are removed from the list of
+# properties offered in the dropdown (see the `__main__` section).
+REMOVE = ["docking", "docking_tdc", "molecule_one", "askcos", "plogp"]
+REMOVE.extend(["similarity_seed", "activity_against_target", "organtox"])
+# Properties whose predictor returns several values per molecule. Each entry maps
+# the capitalized property name to a comma-separated string of output names;
+# `main` splits this string on "," to label the result columns and instantiates
+# these predictors with `algorithm_version="v0"`.
+# NOTE(review): "Bening" in the Sider entry looks like a typo for "Benign";
+# the string is shown to users, so confirm before changing it.
+MODEL_PROP_DESCRIPTION = {
+    "Tox21": "NR-AR, NR-AR-LBD, NR-AhR, NR-Aromatase, NR-ER, NR-ER-LBD, NR-PPAR-gamma, SR-ARE, SR-ATAD5, SR-HSE, SR-MMP, SR-p53",
+    "Sider": "Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,Bening & malignant,General disorders,Endocrine disorders,Surgical & medical procedures,Vascular disorders,Blood & lymphatic disorders,Skin & subcutaneous disorders,Congenital & genetic disorders,Infections,Respiratory & thoracic disorders,Psychiatric disorders,Renal & urinary disorders,Pregnancy conditions,Ear disorders,Cardiac disorders,Nervous system disorders,Injury & procedural complications",
+    "Clintox": "FDA approval, Clinical trial failure",
+}
+def main(property: str, smiles: str, smiles_file: str):
+    algo, config = MOLECULE_PROPERTY_PREDICTOR_FACTORY[property.lower()]
+    kwargs = (
+        {"algorithm_version": "v0"} if property in MODEL_PROP_DESCRIPTION.keys() else {}
+    )
+    model = algo(config(**kwargs))
+    if smiles is not None and smiles_file is not None:
+        raise ValueError("Pass either smiles or smiles_file, not both.")
+    elif smiles is not None:
+        smiles = [smiles]
+    elif smiles_file is not None:
+        smiles = pd.read_csv(smiles_file.name, header=None, sep="\t")[0].tolist()
+    props = np.array(list(map(model, smiles))).round(2)
+    # Expand to 2D array if needed
+    if len(props.shape) == 1:
+        props = np.expand_dims(np.array(props), -1)
+    if property in MODEL_PROP_DESCRIPTION.keys():
+        property_names = MODEL_PROP_DESCRIPTION[property].split(",")
+    else:
+        property_names = [property]
+    return draw_grid_predict(
+        smiles, props, property_names=property_names, domain="Molecules"
+    )
+if __name__ == "__main__":
+    # Preparation (retrieve all available algorithms)
+    properties = list(MOLECULE_PROPERTY_PREDICTOR_FACTORY.keys())[::-1]
+    for prop in REMOVE:
+        prop_to_idx = dict(zip(properties, range(len(properties))))
+        properties.pop(prop_to_idx[prop])
+    properties = list(map(lambda x: x.capitalize(), properties))
+    # Load metadata
+    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
+    examples = [
+        ["Qed", None, metadata_root.joinpath("examples.smi")],
+        [
+            "Esol",
+            "CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1",
+            None,
+        ],
+    ]
+    with open(metadata_root.joinpath("article.md"), "r") as f:
+        article = f.read()
+    with open(metadata_root.joinpath("description.md"), "r") as f:
+        description = f.read()
+    demo = gr.Interface(
+        fn=main,
+        title="Molecular properties",
+        inputs=[
+            gr.Dropdown(properties, label="Property", value="qed"),
+            gr.Textbox(
+                label="Single SMILES",
+                placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
+                lines=1,
+            ),
+            gr.File(
+                file_types=[".smi"],
+                label="Multiple SMILES (tab-separated, `.smi` file)",
+            ),
+        ],
+        outputs=gr.HTML(label="Output"),
+        article=article,
+        description=description,
+        examples=examples,
+    )
+    demo.launch(debug=True, show_error=True)

model_cards/article.md ADDED Viewed

	@@ -0,0 +1,68 @@

+# Supported molecular properties
+### ClinTox
+A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on [ClinTox](https://moleculenet.org/datasets-1) dataset which has two endpoints: Probability of FDA approval and Probability of failure in clinical trials. When using this model, please cite *Born et al. (2023)* (citation below).
+### SIDER
+A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [SIDER](https://moleculenet.org/datasets-1) dataset for 27 different types of side effects of drugs. When using this model, please cite *Born et al. (2023)* (citation below).
+### Tox21
+A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [Tox21](https://tripod.nih.gov/tox/) dataset with 12 different types of environmental toxicities. When using this model, please cite *Born et al. (2023)* (citation below).
+### SCScore
+Predict the synthetic complexity score (SCScore) as presented in [Coley et al. (*J. Chem. Inf. Model.*; 2018)](https://pubs.acs.org/doi/full/10.1021/acs.jcim.7b00622).
+### SAS
+Estimate the synthetic accessibility score (SAS) as presented in [Ertl et al. (*Journal of Cheminformatics*; 2009)](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-1-8).
+### Lipinski
+Measure whether a molecule conforms to the Lipinski-rule-of-five as presented in [Lipinski et al. (*Advanced Drug Delivery Reviews*; 2001)](https://www.sciencedirect.com/science/article/abs/pii/S0169409X00001290?via%3Dihub).
+### Penalized logP
+Measure the penalized logP (partition coefficient) score as presented in [Gomez-Bombarelli et al. (*ACS Central Science*; 2018)](https://arxiv.org/abs/1610.02415v1). This is the logP minus the number of rings with > 6 atoms minus the SAS.
+### QED
+Measure the drug-likeness as presented in [Bickerton et al. (*Nature Chemistry*; 2012)](https://www.nature.com/articles/nchem.1243).
+### LogP
+Measure the logP (partition coefficient) of a molecule as presented in [Wildman et al. (*J. Chem. Inf. Comput. Sci.*; 1999)](https://pubs.acs.org/doi/full/10.1021/ci990307l).
+### Bertz
+Calculate the first general index of molecular complexity as presented in [Bertz (*Journal of the American Chemical Society*; 1981)](https://pubs.acs.org/doi/pdf/10.1021/ja00402a071).
+### TPSA
+Calculate the topological polar surface area of a molecule as presented in [Ertl et al. (*Journal of Medicinal Chemistry*; 2000)](https://pubs.acs.org/doi/full/10.1021/jm000942e).
+### Is-Scaffold
+Whether the molecule is identical to its [Murcko scaffold](https://rdkit.org/docs/source/rdkit.Chem.Scaffolds.MurckoScaffold.html).
+### Number-Of-X
+Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
+### Molecular Weight
+Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
+### ToxSmi citation
+```bib
+@article{born2023chemical,
+  title={Chemical representation learning for toxicity prediction},
+  author={Born, Jannis and Markert, Greta and Janakarajan, Nikita and Kimber, Talia B. and Volkamer, Andrea and Rodriguez Martinez, Maria and Manica, Matteo},
+  journal={Under review at Digital Discovery},
+  year={2023}
+}
+```
+### Unsupported properties
+The following molecular properties are available via the GT4SD API but not in this UI:
+- [MoleculeOne](https://tdcommons.ai/functions/oracles/#moleculeone) endpoint for retrosynthesis
+- [ASKCOS](https://tdcommons.ai/functions/oracles/#askcos) endpoint for retrosynthesis
+- [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against a user-provided target
+- [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against *3pbl*.
+- [Protein-ligand binding](https://tdcommons.ai/functions/oracles/#dopamine-receptor-d2-drd2) against one of the targets *drd2*, *gsk3b*, *jnk3*, *fpscores*, *cyp3a4_veith*, *drd2_current*, *gsk3b_current* or *jnk3_current*.
+- [Tanimoto similarity](https://tdcommons.ai/functions/oracles/#similaritydissimilarity) to a seed molecule.
+Moreover, GT4SD also includes properties on other entities such as [proteins](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.proteins.html) and [crystals](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.crystals.html).

model_cards/description.md ADDED Viewed

	@@ -0,0 +1,7 @@

+<img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+### Molecular property prediction
+This is the GT4SD web-app for prediction of various molecular properties. For **examples** and **documentation** of the supported properties, please see below. Please note that this web-app does not expose **all** properties that are supported in GT4SD (a list of the unsupported ones can be found at the bottom).

model_cards/examples.smi ADDED Viewed

	@@ -0,0 +1,13 @@

+Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O
+C=CCN1C(=O)C(=NNC(=S)NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c2ccccc21
+O=C1C(=Cc2ccc(F)cc2)CCOc2c1ccc1ccccc21
+CC(C)CNc1cc(NCC(C)C)nc(NCC(C)C)n1
+CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1
+COc1ccc2ccccc2c1C1CC1NC(C)=O
+Cc1ccc(-n2c(=O)[nH]cc(C(=O)Nc3ccc4c(c3)OCCO4)c2=O)cc1
+Cc1ccc(NCc2nnc(SCC(=O)NCCc3ccccc3)n2C)cc1
+CCCNC(=O)c1ccc2c(c1)N=C(C)c1c(C)ccc(C)c1S2
+COc1ccc(Cn2ccn(CC(=O)Nc3cc(C)ccc3C)c(=O)c2=O)cc1
+Cn1nccc1C(=O)NN=Cc1c(O)ccc2ccccc12
+CCOC(=O)Nc1cc(N)c2c(n1)NC(C)C(c1ccccc1)=N2
+Cn1nc(N)c2ncc(C(Cl)(Cl)Cl)nc21

requirements.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+-f https://download.pytorch.org/whl/cpu/torch_stable.html
+-f https://data.pyg.org/whl/torch-1.12.1+cpu.html
+# pip==20.2.4
+torch==1.12.1
+torch-scatter
+torch-spline-conv
+torch-sparse
+torch-geometric
+torchvision==0.13.1
+torchaudio==0.12.1
+gt4sd>=1.1.1
+molgx>=0.22.0a1
+molecule_generation
+nglview
+PyTDC==0.3.7
+gradio>=3.9
+markdown-it-py>=2.1.0
+mols2grid>=0.2.0
+pandas>=1.0.0
+pymatgen>=2023.1.9
+terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
+guacamol_baselines @ git+https://github.com/GT4SD/[email protected]
+moses @ git+https://github.com/GT4SD/[email protected]
+paccmann_chemistry @ git+https://github.com/PaccMann/[email protected]
+paccmann_generator @ git+https://github.com/PaccMann/[email protected]
+paccmann_gp @ git+https://github.com/PaccMann/[email protected]
+paccmann_omics @ git+https://github.com/PaccMann/[email protected]
+paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
+reinvent_models @ git+https://github.com/GT4SD/[email protected]

utils.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import logging
+from typing import List
+import numpy as np
+import mols2grid
+import pandas as pd
+from rdkit import Chem
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+def draw_grid_predict(
+    sequences: List[str], properties: np.array, property_names: List[str], domain: str
+) -> str:
+    """
+    Uses mols2grid to draw a HTML grid for the prediction
+    Args:
+        sequences: Sequences for which properties are predicted.
+        properties: Predicted properties. Array of shape (n_samples, n_properties).
+        names: List of property names
+        domain: Domain of the prediction (molecules or proteins).
+    Returns:
+        HTML to display
+    """
+    if domain not in ["Molecules", "Proteins"]:
+        raise ValueError(f"Unsupported domain {domain}")
+    if domain == "Proteins":
+        converter = lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x))
+    else:
+        converter = lambda x: x
+    smiles = []
+    for sequence in sequences:
+        try:
+            seq = converter(sequence)
+            smiles.append(seq)
+        except Exception:
+            logger.warning(f"Could not draw sequence {seq}")
+    result = pd.DataFrame({"SMILES": smiles})
+    for i, name in enumerate(property_names):
+        result[name] = properties[:, i]
+    n_cols = min(3, len(result))
+    size = (140, 200) if len(result) > 3 else (600, 700)
+    obj = mols2grid.display(
+        result,
+        tooltip=list(result.keys()),
+        height=1100,
+        n_cols=n_cols,
+        name="Results",
+        size=size,
+    )
+    return obj.data