jannisborn committed · Commit 8ae3405 · Parent(s): 34ae1d8

update

Files changed:
- README.md +1 -1
- app.py +91 -28
- model_cards/article.md +25 -21
- model_cards/depr_description.md +8 -0
- model_cards/description.md +3 -1
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: GT4SD -
+title: GT4SD - MOSES
 emoji: 💡
 colorFrom: green
 colorTo: blue
app.py
CHANGED
@@ -3,46 +3,107 @@ import pathlib
 
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.
-
+from gt4sd.algorithms.conditional_generation.guacamol import (
+    AaeGenerator,
+    GraphGAGenerator,
+    GraphMCTSGenerator,
+    GuacaMolGenerator,
+    MosesGenerator,
+    OrganGenerator,
+    VaeGenerator,
+    SMILESGAGenerator,
+    SMILESLSTMHCGenerator,
+    SMILESLSTMPPOGenerator,
+)
 from gt4sd.algorithms.registry import ApplicationsRegistry
+
 from utils import draw_grid_generate
 
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
-TITLE = "
+TITLE = "GuacaMol & MOSES"
+
+CONFIG_FACTORY = {
+    "Moses - AaeGenerator": AaeGenerator,
+    "Moses - VaeGenerator": VaeGenerator,
+    "Moses - OrganGenerator": OrganGenerator,
+    "GuacaMol - GraphGAGenerator": GraphGAGenerator,
+    "GuacaMol - GraphMCTSGenerator": GraphMCTSGenerator,
+    "GuacaMol - SMILESLSTMHCGenerator": SMILESLSTMHCGenerator,
+    "GuacaMol - SMILESLSTMPPOGenerator": SMILESLSTMPPOGenerator,
+    "GuacaMol - SMILESGAGenerator": SMILESGAGenerator,
+}
+# OVERWRITE
+CONFIG_FACTORY = {
+    "AaeGenerator": AaeGenerator,
+    "VaeGenerator": VaeGenerator,
+    "OrganGenerator": OrganGenerator,
+}
+MODEL_FACTORY = {"Moses": MosesGenerator, "GuacaMol": GuacaMolGenerator}
 
 
 def run_inference(
     algorithm_version: str,
-
-
+    length: int,
+    # population_size: int,
+    # random_start: bool,
+    # patience: int,
+    # generations: int,
     number_of_samples: int,
-    seed: int,
 ):
-
-
-
-
-
-
-
-
-
+    config_class = CONFIG_FACTORY[algorithm_version]
+    # family = algorithm_version.split(" - ")[0]
+    family = "Moses"
+    model_class = MODEL_FACTORY[family]
+
+    if family == "Moses":
+        kwargs = {"n_samples": number_of_samples, "max_len": length}
+    elif family == "GuacaMol":
+        kwargs = {
+            "population_size": population_size,
+            "random_start": random_start,
+            "patience": patience,
+            "generations": generations,
+        }
+        if "MCTS" in algorithm_version:
+            kwargs.pop("random_start")
+        if "LSTMHC" in algorithm_version:
+            kwargs["max_len"] = length
+            kwargs.pop("population_size")
+            kwargs.pop("patience")
+            kwargs.pop("generations")
+        if "LSTMPPO" in algorithm_version:
+            kwargs = {}
+    else:
+        raise ValueError(f"Unknown family {family}")
+
+    config = config_class(**kwargs)
+
+    model = model_class(configuration=config, target={})
     samples = list(model.sample(number_of_samples))
 
-
-    return draw_grid_generate(seed_mols, samples)
+    return draw_grid_generate(seeds=[], samples=samples, n_cols=5)
 
 
 if __name__ == "__main__":
 
     # Preparation (retrieve all available algorithms)
     all_algos = ApplicationsRegistry.list_available()
+    guacamol_algos = [
+        "GuacaMol - " + x["algorithm_application"]
+        for x in list(filter(lambda x: "GuacaMol" in x["algorithm_name"], all_algos))
+    ]
+    moses_algos = [
+        "Moses - " + x["algorithm_application"]
+        for x in list(filter(lambda x: "Moses" in x["algorithm_name"], all_algos))
+    ]
+    algos = guacamol_algos + moses_algos
+
+    # Overwrite to have only Moses
     algos = [
-        x["
-        for x in list(filter(lambda x:
+        x["algorithm_application"]
+        for x in list(filter(lambda x: "Moses" in x["algorithm_name"], all_algos))
    ]
 
     # Load metadata
@@ -59,19 +120,21 @@ if __name__ == "__main__":
 
     demo = gr.Interface(
         fn=run_inference,
-        title="
+        title="MOSES",
         inputs=[
-            gr.Dropdown(algos, label="Algorithm version", value="
-            gr.
-                label="
-                placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
-                lines=1,
+            gr.Dropdown(algos, label="Algorithm version", value="AaeGenerator"),
+            gr.Slider(
+                minimum=5, maximum=500, value=100, label="Sequence length", step=1
             ),
-            gr.Slider(
+            # gr.Slider(
+            #     minimum=5, maximum=500, value=100, label="Population size", step=1
+            # ),
+            # gr.Radio(choices=[True, False], label="Random start", value=False),
+            # gr.Slider(minimum=1, maximum=10, value=4, label="Patience"),
+            # gr.Slider(minimum=1, maximum=10, value=2, label="Generations"),
             gr.Slider(
-                minimum=1, maximum=50, value=
+                minimum=1, maximum=50, value=5, label="Number of samples", step=1
             ),
-            gr.Number(value=42, label="Seed", precision=0),
         ],
         outputs=gr.HTML(label="Output"),
         article=article,
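The Moses path wired up in `run_inference` above (a `CONFIG_FACTORY` entry builds the configuration, `MODEL_FACTORY` picks the generator, and `model.sample()` yields SMILES) can also be exercised outside the Gradio UI. The following is a minimal sketch, assuming a working `gt4sd` installation with the MOSES models available; the `n_samples`/`max_len` keyword names and the `target={}` call are taken from the code in this commit, while the concrete values are only illustrative.

```python
# Minimal sketch of the Moses sampling path used by run_inference above.
# Assumes gt4sd is installed and its MOSES generators can be resolved.
from gt4sd.algorithms.conditional_generation.guacamol import (
    AaeGenerator,
    MosesGenerator,
)

# Mirror the "Moses" branch of run_inference: the configuration carries the
# number of samples and the maximum sequence length.
config = AaeGenerator(n_samples=5, max_len=100)

# The generator wraps the configuration; target={} as in app.py.
model = MosesGenerator(configuration=config, target={})

# model.sample() yields SMILES strings; materialize five of them.
samples = list(model.sample(5))
print(samples)
```

Note that `run_inference` hard-codes `family = "Moses"`, so the GuacaMol branch (and its `population_size`, `random_start`, `patience` and `generations` keywords) is effectively dead code in this version of the Space.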
model_cards/article.md
CHANGED
@@ -1,37 +1,39 @@
 # Model documentation & parameters
 
-**Algorithm Version**: Which
+**Algorithm Version**: Which algorithm to use (VAE, AAE or ORGAN).
 
-**
+**Sequence length**: Maximal length of the sequence.
 
 **Number of samples**: How many samples should be generated (between 1 and 50).
 
-**Beam size**: Beam size used in beam search decoding (the higher the slower but better).
 
-**Seed**: The random seed used for initialization.
 
-
+# Model card - MOSES
 
-**
-
-**Developers**: Krzysztof Maziarz and co-authors from Microsoft Research and Novartis (full reference at bottom).
+**Model Details**: MOSES is a benchmarking platform for molecular generation models.
+
+**Developers**: Daniil Polykovskiy and colleagues from *Insilico Medicine*.
 
 **Distributors**: Developer's code wrapped and distributed by GT4SD Team (2023) from IBM Research.
 
-**Model date**: Released
+**Model date**: Released in 2020.
 
-**Model version**: Model provided by original authors,
+**Model version**: Model provided by original authors, adapted by GT4SD authors in [this GitHub repo](https://github.com/GT4SD/moses).
 
-**Model type**:
+**Model type**: Three types of models:
+1. **VAE**: VAEs [[1](https://pubs.acs.org/doi/full/10.1021/acscentsci.7b00572),[2](https://pubs.acs.org/doi/full/10.1021/acscentsci.7b00572),[3](https://arxiv.org/abs/1711.07839)] are a framework for training two neural networks, an encoder and a decoder, to learn a mapping from a high-dimensional data representation into a lower-dimensional space and back. The lower-dimensional space is called the latent space, which is often a continuous vector space with a normally distributed latent representation. VAE parameters are optimized to encode and decode data by minimizing the reconstruction loss while also minimizing a KL-divergence term arising from the variational approximation, which can loosely be interpreted as a regularization term. Since molecules are discrete objects, a properly trained VAE defines an invertible continuous representation of a molecule. MOSES combines aspects of both implementations, using a bidirectional Gated Recurrent Unit (GRU) with a linear output layer as the encoder. The decoder is a 3-layer GRU RNN with 512 hidden dimensions and intermediate dropout layers with dropout probability 0.2. Training is done with a batch size of 128, gradient clipping at 50, a KL-term weight of 1, and the Adam optimizer with a learning rate of 0.0003 for 50 epochs.
+2. **AAE**: AAEs [[1](https://arxiv.org/abs/1511.05644)] combine the idea of a VAE with the adversarial training used in GANs. One of the main drawbacks of the VAE is the KL-divergence term, which has a closed-form analytical solution only for a handful of distributions. In an AAE, the KL-divergence term is avoided by training a discriminator network to predict whether a given sample came from the latent space of the autoencoder or from a prior distribution. Parameters are optimized to minimize the reconstruction loss and the discriminator loss. [Kadurin et al.](https://arxiv.org/abs/1511.05644) applied the AAE architecture to the drug generation task. The model consists of an encoder with a 1-layer bidirectional LSTM with 380 hidden dimensions, a decoder with a 2-layer LSTM with 640 hidden dimensions, and a shared embedding of size 32. The latent space has dimension 640, and the discriminator network is a 2-layer fully connected neural network with 640 and 256 nodes, respectively, using the ELU activation function. Training is done with a batch size of 128 and the Adam optimizer with a learning rate of 0.001 for 25 epochs.
+3. **ORGAN**: The objective-reinforced generative adversarial network (ORGAN) (see [[1](https://arxiv.org/abs/1705.10843),[2](https://arxiv.org/abs/1705.10843)]) is a sequence generation model based on adversarial training that aims at generating discrete sequences emulating a data distribution while biasing the generation process towards desired objective rewards using reinforcement learning. ORGAN incorporates at least two networks: a generator and a discriminator. The goal of the generator is to create synthetic data examples that are indistinguishable from the empirical data distribution, while the discriminator learns to distinguish synthetic from real data samples. Both models are trained in alternation.
+To properly train a GAN, the gradient must be backpropagated between the generator and discriminator models, which is not possible when the data samples come from a discrete distribution such as a multinomial, since discrete distributions are non-differentiable. SeqGAN proposed to learn a policy gradient that can be backpropagated, computed with the REINFORCE algorithm. ORGAN extends this framework to include reward functions other than the discriminator. Reinforcement is done with an N-depth Monte Carlo tree search, and the reward is a weighted sum of the discriminator probability and the objective reward. The generator and discriminator are pre-trained for 250 and 50 epochs, respectively, and then jointly trained for 100 epochs with the Adam optimizer and a learning rate of 0.0001. In the MOSES experiments, chemical validity and uniqueness are used as rewards.
 
-**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**: Trained by the original authors with the default parameters provided [
+**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**: Trained by the original authors with the default parameters provided [in their paper](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
 
-**Paper or other resource for more information**: [
+**Paper or other resource for more information**: [Molecular Sets (MOSES): A Benchmarking Platform for Molecular Generation Models
+(2020; *Frontiers in Pharmacology*)](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
 
 **License**: MIT
 
-**Where to send questions or comments about the model**: Open an issue on
+**Where to send questions or comments about the model**: Open an issue on the [GT4SD repo](https://github.com/GT4SD/moses).
 
 **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
 
@@ -43,7 +45,7 @@
 
 **Metrics**: Validation loss on decoding correct molecules. Evaluated on several downstream tasks.
 
-**Datasets**:
+**Datasets**: 4M molecules from ZINC.
 
 **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
 
@@ -54,12 +56,14 @@ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi
 ## Citation
 
 ```bib
-@
-
-
-
-
-
+@article{polykovskiy2020molecular,
+  title={Molecular sets (MOSES): a benchmarking platform for molecular generation models},
+  author={Polykovskiy, Daniil and Zhebrak, Alexander and Sanchez-Lengeling, Benjamin and Golovanov, Sergey and Tatanov, Oktai and Belyaev, Stanislav and Kurbanov, Rauf and Artamonov, Aleksey and Aladinskiy, Vladimir and Veselov, Mark and others},
+  journal={Frontiers in pharmacology},
+  volume={11},
+  pages={565644},
+  year={2020},
+  publisher={Frontiers Media SA}
 }
 ```
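As a reading aid for the model card above, the objectives it describes in prose can be summarized compactly. The formulas below are standard textbook formulations, not extracted from the MOSES code; the symbols β and λ are my notation for the "KL-term weight" and the "weighted sum" mentioned in the card.

```latex
% VAE: reconstruction loss plus a weighted KL regularizer on the latent code z
\mathcal{L}_{\mathrm{VAE}}(\theta, \phi; x)
    = -\mathbb{E}_{q_\phi(z \mid x)}\big[\log p_\theta(x \mid z)\big]
    + \beta \, D_{\mathrm{KL}}\big(q_\phi(z \mid x) \,\|\, p(z)\big)

% AAE: the KL term above is replaced by an adversarial loss, in which a
% discriminator tries to tell samples of q_phi(z | x) apart from prior samples p(z).

% ORGAN: sequence-level reward mixing the discriminator score D with a
% task objective O (e.g. validity or uniqueness), as described in the card.
R(y_{1:T}) = \lambda \, D(y_{1:T}) + (1 - \lambda) \, O(y_{1:T})
```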
model_cards/depr_description.md
ADDED
@@ -0,0 +1,8 @@
+<img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+
+Miscellaneous models from the [GuacaMol](https://github.com/GT4SD/guacamol_baselines) and the [Moses](https://github.com/GT4SD/moses) benchmarks. For details, please see the [GuacaMol paper (Brown et al., 2019, *J. Chem. Inf. Model.*)](https://pubs.acs.org/doi/full/10.1021/acs.jcim.8b00839) and the [Moses paper (Polykovskiy et al., 2020, *Front. Pharmacology*)](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
+
+**NOTE**: Only a subset of the features of the GuacaMol and Moses models are exposed through this UI. Most importantly, there is no support (yet) for property optimization. For details on this usage, please see the [GT4SD docs](https://gt4sd.github.io/gt4sd-core/api/gt4sd.algorithms.conditional_generation.guacamol.html).
+
+For **examples** and **documentation** of the model parameters, please see below.
+Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
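The deprecated description above still mentions the GuacaMol baselines, which app.py imports but disables. Purely as an illustration, a sketch of that GuacaMol path might look as follows; the keyword names (`population_size`, `random_start`, `patience`, `generations`) and the default values are taken from the commented-out branch and sliders in this commit, and whether they apply to a given GuacaMol application depends on your gt4sd installation.

```python
# Hypothetical sketch of the GuacaMol branch that app.py currently disables
# (family is hard-coded to "Moses"); keyword names come from that branch.
from gt4sd.algorithms.conditional_generation.guacamol import (
    GraphGAGenerator,
    GuacaMolGenerator,
)

# Configuration for the graph genetic-algorithm baseline, using the keywords
# and default slider values that app.py would pass for the GuacaMol family.
config = GraphGAGenerator(
    population_size=100,
    random_start=False,
    patience=4,
    generations=2,
)

model = GuacaMolGenerator(configuration=config, target={})
samples = list(model.sample(5))
print(samples)
```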
model_cards/description.md
CHANGED
@@ -1,6 +1,8 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
 
-
+Miscellaneous models from the [Moses](https://github.com/GT4SD/moses) benchmark. For details, please see the [Moses paper (Polykovskiy et al., 2020, *Front. Pharmacology*)](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
+
+**NOTE**: Only a subset of the features of the Moses models are exposed through this UI. For details on this usage, please see the [GT4SD docs](https://gt4sd.github.io/gt4sd-core/api/gt4sd.algorithms.conditional_generation.guacamol.html).
 
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.