jannisborn committed on
Commit
c351b1e
1 Parent(s): 480220c
Files changed (5) hide show
  1. README.md +1 -1
  2. app.py +33 -47
  3. model_cards/description.md +2 -2
  4. model_cards/examples.smi +14 -13
  5. utils.py +2 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Molecular properties
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
 
1
  ---
2
+ title: Protein properties
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
app.py CHANGED
@@ -4,71 +4,60 @@ import pathlib
4
  import gradio as gr
5
  import numpy as np
6
  import pandas as pd
7
- from gt4sd.properties.molecules import MOLECULE_PROPERTY_PREDICTOR_FACTORY
8
 
9
  from utils import draw_grid_predict
10
 
11
  logger = logging.getLogger(__name__)
12
  logger.addHandler(logging.NullHandler())
13
 
14
- REMOVE = ["docking", "docking_tdc", "molecule_one", "askcos", "plogp"]
15
- REMOVE.extend(["similarity_seed", "activity_against_target", "organtox"])
16
 
17
- MODEL_PROP_DESCRIPTION = {
18
- "Tox21": "NR-AR, NR-AR-LBD, NR-AhR, NR-Aromatase, NR-ER, NR-ER-LBD, NR-PPAR-gamma, SR-ARE, SR-ATAD5, SR-HSE, SR-MMP, SR-p53",
19
- "Sider": "Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,Bening & malignant,General disorders,Endocrine disorders,Surgical & medical procedures,Vascular disorders,Blood & lymphatic disorders,Skin & subcutaneous disorders,Congenital & genetic disorders,Infections,Respiratory & thoracic disorders,Psychiatric disorders,Renal & urinary disorders,Pregnancy conditions,Ear disorders,Cardiac disorders,Nervous system disorders,Injury & procedural complications",
20
- "Clintox": "FDA approval, Clinical trial failure",
21
- }
22
 
23
 
24
- def main(property: str, smiles: str, smiles_file: str):
25
 
26
- algo, config = MOLECULE_PROPERTY_PREDICTOR_FACTORY[property.lower()]
27
- kwargs = (
28
- {"algorithm_version": "v0"} if property in MODEL_PROP_DESCRIPTION.keys() else {}
29
- )
 
 
 
 
 
30
  model = algo(config(**kwargs))
31
- if smiles is not None and smiles_file is not None:
32
- raise ValueError("Pass either smiles or smiles_file, not both.")
33
- elif smiles is not None:
34
- smiles = [smiles]
35
- elif smiles_file is not None:
36
- smiles = pd.read_csv(smiles_file.name, header=None, sep="\t")[0].tolist()
37
- props = np.array(list(map(model, smiles))).round(2)
 
 
38
 
39
  # Expand to 2D array if needed
40
  if len(props.shape) == 1:
41
  props = np.expand_dims(np.array(props), -1)
42
 
43
- if property in MODEL_PROP_DESCRIPTION.keys():
44
- property_names = MODEL_PROP_DESCRIPTION[property].split(",")
45
- else:
46
- property_names = [property]
47
-
48
- return draw_grid_predict(
49
- smiles, props, property_names=property_names, domain="Molecules"
50
- )
51
 
52
 
53
  if __name__ == "__main__":
54
 
55
  # Preparation (retrieve all available algorithms)
56
- properties = list(MOLECULE_PROPERTY_PREDICTOR_FACTORY.keys())[::-1]
57
- for prop in REMOVE:
58
- prop_to_idx = dict(zip(properties, range(len(properties))))
59
- properties.pop(prop_to_idx[prop])
60
  properties = list(map(lambda x: x.capitalize(), properties))
61
 
62
  # Load metadata
63
  metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
64
 
65
  examples = [
66
- ["Qed", None, metadata_root.joinpath("examples.smi")],
67
- [
68
- "Esol",
69
- "CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1",
70
- None,
71
- ],
72
  ]
73
 
74
  with open(metadata_root.joinpath("article.md"), "r") as f:
@@ -78,18 +67,15 @@ if __name__ == "__main__":
78
 
79
  demo = gr.Interface(
80
  fn=main,
81
- title="Molecular properties",
82
  inputs=[
83
- gr.Dropdown(properties, label="Property", value="qed"),
84
  gr.Textbox(
85
- label="Single SMILES",
86
- placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
87
- lines=1,
88
- ),
89
- gr.File(
90
- file_types=[".smi"],
91
- label="Multiple SMILES (tab-separated, `.smi` file)",
92
  ),
 
 
 
93
  ],
94
  outputs=gr.HTML(label="Output"),
95
  article=article,
 
4
  import gradio as gr
5
  import numpy as np
6
  import pandas as pd
7
+ from gt4sd.properties.proteins import PROTEIN_PROPERTY_PREDICTOR_FACTORY
8
 
9
  from utils import draw_grid_predict
10
 
11
  logger = logging.getLogger(__name__)
12
  logger.addHandler(logging.NullHandler())
13
 
 
 
14
 
15
+ AMIDE_FNS = ["protein_weight", "charge", "charge_density", "isoelectric_point"]
16
+ PH_FNS = ["charge", "charge_density", "isoelectric_point"]
 
 
 
17
 
18
 
19
+ def main(property: str, seq: str, seq_file: str, amide: bool, ph: float):
20
 
21
+ prop_name = property.lower()
22
+ algo, config = PROTEIN_PROPERTY_PREDICTOR_FACTORY[prop_name]
23
+
24
+ # Pass hyperparameters if applicable
25
+ kwargs = {}
26
+ if prop_name in AMIDE_FNS:
27
+ kwargs["amide"] = amide
28
+ if prop_name in PH_FNS:
29
+ kwargs["ph"] = ph
30
  model = algo(config(**kwargs))
31
+
32
+ # Read and parse data
33
+ if seq is not None and seq_file is not None:
34
+ raise ValueError("Pass either smiles or seq_file, not both.")
35
+ elif seq is not None:
36
+ seqs = [seq]
37
+ elif seq_file is not None:
38
+ seqs = pd.read_csv(seq_file.name, header=None, sep="\t")[0].tolist()
39
+ props = np.array(list(map(model, seqs))).round(2)
40
 
41
  # Expand to 2D array if needed
42
  if len(props.shape) == 1:
43
  props = np.expand_dims(np.array(props), -1)
44
 
45
+ return draw_grid_predict(seqs, props, property_names=[property], domain="Proteins")
 
 
 
 
 
 
 
46
 
47
 
48
  if __name__ == "__main__":
49
 
50
  # Preparation (retrieve all available algorithms)
51
+ properties = list(PROTEIN_PROPERTY_PREDICTOR_FACTORY.keys())[::-1]
 
 
 
52
  properties = list(map(lambda x: x.capitalize(), properties))
53
 
54
  # Load metadata
55
  metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
56
 
57
  examples = [
58
+ ["Aliphaticity", None, metadata_root.joinpath("examples.smi"), False, 7],
59
+ ["Isoelectric_point", "KFLIYQMECSTMIFGL", None, False, 7],
60
+ ["Charge", "KFLIYQMECSTMIFGL", None, True, 12],
 
 
 
61
  ]
62
 
63
  with open(metadata_root.joinpath("article.md"), "r") as f:
 
67
 
68
  demo = gr.Interface(
69
  fn=main,
70
+ title="Protein properties",
71
  inputs=[
72
+ gr.Dropdown(properties, label="Property", value="Instability"),
73
  gr.Textbox(
74
+ label="Single Protein sequence", placeholder="KFLIYQMECSTMIFGL", lines=1
 
 
 
 
 
 
75
  ),
76
+ gr.File(file_types=[".smi"], label="One AAS per line"),
77
+ gr.Radio(choices=[True, False], label="Amide", value=True),
78
+ gr.Slider(minimum=0, maximum=14, value=7, label="pH", description="Blub"),
79
  ],
80
  outputs=gr.HTML(label="Output"),
81
  article=article,
model_cards/description.md CHANGED
@@ -2,6 +2,6 @@
2
 
3
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
4
 
5
- ### Molecular property prediction
6
 
7
- This is the GT4SD web-app for prediction of various molecular properties. For **examples** and **documentation** of the supported properties, please see below. Please note that this API does not expose **all** properties that are supported in GT4SD (a list of the non-supported ones can be found at the bottom).
 
2
 
3
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
4
 
5
+ ### Protein property prediction
6
 
7
+ This is the GT4SD web-app for prediction of various protein (or peptide) properties. For **examples** and **documentation** of the supported properties, please see below. Please note that this API does not expose **all** properties that are supported in GT4SD (a list of the non-supported ones can be found at the bottom).
model_cards/examples.smi CHANGED
@@ -1,13 +1,14 @@
1
- Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O
2
- C=CCN1C(=O)C(=NNC(=S)NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c2ccccc21
3
- O=C1C(=Cc2ccc(F)cc2)CCOc2c1ccc1ccccc21
4
- CC(C)CNc1cc(NCC(C)C)nc(NCC(C)C)n1
5
- CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1
6
- COc1ccc2ccccc2c1C1CC1NC(C)=O
7
- Cc1ccc(-n2c(=O)[nH]cc(C(=O)Nc3ccc4c(c3)OCCO4)c2=O)cc1
8
- Cc1ccc(NCc2nnc(SCC(=O)NCCc3ccccc3)n2C)cc1
9
- CCCNC(=O)c1ccc2c(c1)N=C(C)c1c(C)ccc(C)c1S2
10
- COc1ccc(Cn2ccn(CC(=O)Nc3cc(C)ccc3C)c(=O)c2=O)cc1
11
- Cn1nccc1C(=O)NN=Cc1c(O)ccc2ccccc12
12
- CCOC(=O)Nc1cc(N)c2c(n1)NC(C)C(c1ccccc1)=N2
13
- Cn1nc(N)c2ncc(C(Cl)(Cl)Cl)nc21
 
 
1
+
2
+ NEGVKAAW
3
+ FPRPWLHGL
4
+ HPVGEADYFEY
5
+ TPGPGVRYPL
6
+ EEYLKAWTF
7
+ RMFPNAPYL
8
+ GPGMKARVL
9
+ RLRPGGKKK
10
+ VMAPRTLIL
11
+ ARMILMTHF
12
+ FLYNLLTRV
13
+ SLYNTVATL
14
+ ILKEPVHGV
utils.py CHANGED
@@ -42,6 +42,8 @@ def draw_grid_predict(
42
  logger.warning(f"Could not draw sequence {seq}")
43
 
44
  result = pd.DataFrame({"SMILES": smiles})
 
 
45
  for i, name in enumerate(property_names):
46
  result[name] = properties[:, i]
47
  n_cols = min(3, len(result))
 
42
  logger.warning(f"Could not draw sequence {seq}")
43
 
44
  result = pd.DataFrame({"SMILES": smiles})
45
+ if domain == "Proteins":
46
+ result["Seqs"] = sequences
47
  for i, name in enumerate(property_names):
48
  result[name] = properties[:, i]
49
  n_cols = min(3, len(result))