jannisborn committed · Commit 8ae3405 · Parent(s): 34ae1d8

update

Files changed:
- README.md +1 -1
- app.py +91 -28
- model_cards/article.md +25 -21
- model_cards/depr_description.md +8 -0
- model_cards/description.md +3 -1
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title: GT4SD -
+title: GT4SD - MOSES
 emoji: 💡
 colorFrom: green
 colorTo: blue
app.py
CHANGED
@@ -3,46 +3,107 @@ import pathlib
 
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.
-
+from gt4sd.algorithms.conditional_generation.guacamol import (
+    AaeGenerator,
+    GraphGAGenerator,
+    GraphMCTSGenerator,
+    GuacaMolGenerator,
+    MosesGenerator,
+    OrganGenerator,
+    VaeGenerator,
+    SMILESGAGenerator,
+    SMILESLSTMHCGenerator,
+    SMILESLSTMPPOGenerator,
+)
 from gt4sd.algorithms.registry import ApplicationsRegistry
+
 from utils import draw_grid_generate
 
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 
-TITLE = "
+TITLE = "GuacaMol & MOSES"
+
+CONFIG_FACTORY = {
+    "Moses - AaeGenerator": AaeGenerator,
+    "Moses - VaeGenerator": VaeGenerator,
+    "Moses - OrganGenerator": OrganGenerator,
+    "GuacaMol - GraphGAGenerator": GraphGAGenerator,
+    "GuacaMol - GraphMCTSGenerator": GraphMCTSGenerator,
+    "GuacaMol - SMILESLSTMHCGenerator": SMILESLSTMHCGenerator,
+    "GuacaMol - SMILESLSTMPPOGenerator": SMILESLSTMPPOGenerator,
+    "GuacaMol - SMILESGAGenerator": SMILESGAGenerator,
+}
+# OVERWRITE
+CONFIG_FACTORY = {
+    "AaeGenerator": AaeGenerator,
+    "VaeGenerator": VaeGenerator,
+    "OrganGenerator": OrganGenerator,
+}
+MODEL_FACTORY = {"Moses": MosesGenerator, "GuacaMol": GuacaMolGenerator}
 
 
 def run_inference(
     algorithm_version: str,
-
-
+    length: int,
+    # population_size: int,
+    # random_start: bool,
+    # patience: int,
+    # generations: int,
     number_of_samples: int,
-    seed: int,
 ):
-
-
-
-
-
-
-
-
-
+    config_class = CONFIG_FACTORY[algorithm_version]
+    # family = algorithm_version.split(" - ")[0]
+    family = "Moses"
+    model_class = MODEL_FACTORY[family]
+
+    if family == "Moses":
+        kwargs = {"n_samples": number_of_samples, "max_len": length}
+    elif family == "GuacaMol":
+        kwargs = {
+            "population_size": population_size,
+            "random_start": random_start,
+            "patience": patience,
+            "generations": generations,
+        }
+        if "MCTS" in algorithm_version:
+            kwargs.pop("random_start")
+        if "LSTMHC" in algorithm_version:
+            kwargs["max_len"] = length
+            kwargs.pop("population_size")
+            kwargs.pop("patience")
+            kwargs.pop("generations")
+        if "LSTMPPO" in algorithm_version:
+            kwargs = {}
+    else:
+        raise ValueError(f"Unknown family {family}")
+
+    config = config_class(**kwargs)
+
+    model = model_class(configuration=config, target={})
     samples = list(model.sample(number_of_samples))
 
-
-    return draw_grid_generate(seed_mols, samples)
+    return draw_grid_generate(seeds=[], samples=samples, n_cols=5)
 
 
 if __name__ == "__main__":
 
     # Preparation (retrieve all available algorithms)
     all_algos = ApplicationsRegistry.list_available()
+    guacamol_algos = [
+        "GuacaMol - " + x["algorithm_application"]
+        for x in list(filter(lambda x: "GuacaMol" in x["algorithm_name"], all_algos))
+    ]
+    moses_algos = [
+        "Moses - " + x["algorithm_application"]
+        for x in list(filter(lambda x: "Moses" in x["algorithm_name"], all_algos))
+    ]
+    algos = guacamol_algos + moses_algos
+
+    # Overwrite to have only Moses
     algos = [
-        x["
-        for x in list(filter(lambda x:
+        x["algorithm_application"]
+        for x in list(filter(lambda x: "Moses" in x["algorithm_name"], all_algos))
    ]
 
     # Load metadata
@@ -59,19 +120,21 @@ if __name__ == "__main__":
 
     demo = gr.Interface(
         fn=run_inference,
-        title="
+        title="MOSES",
         inputs=[
-            gr.Dropdown(algos, label="Algorithm version", value="
-            gr.
-                label="
-                placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
-                lines=1,
+            gr.Dropdown(algos, label="Algorithm version", value="AaeGenerator"),
+            gr.Slider(
+                minimum=5, maximum=500, value=100, label="Sequence length", step=1
             ),
-            gr.Slider(
+            # gr.Slider(
+            #     minimum=5, maximum=500, value=100, label="Population size", step=1
+            # ),
+            # gr.Radio(choices=[True, False], label="Random start", value=False),
+            # gr.Slider(minimum=1, maximum=10, value=4, label="Patience"),
+            # gr.Slider(minimum=1, maximum=10, value=2, label="Generations"),
             gr.Slider(
-                minimum=1, maximum=50, value=
+                minimum=1, maximum=50, value=5, label="Number of samples", step=1
             ),
-            gr.Number(value=42, label="Seed", precision=0),
         ],
         outputs=gr.HTML(label="Output"),
         article=article,
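The Moses path wired up in `run_inference` above (a `CONFIG_FACTORY` entry builds the configuration, `MODEL_FACTORY` picks the generator, and `model.sample()` yields SMILES) can also be exercised outside the Gradio UI. The following is a minimal sketch, assuming a working `gt4sd` installation with the MOSES models available; the `n_samples`/`max_len` keyword names and the `target={}` call are taken from the code in this commit, while the concrete values are only illustrative.

```python
# Minimal sketch of the Moses sampling path used by run_inference above.
# Assumes gt4sd is installed and its MOSES generators can be resolved.
from gt4sd.algorithms.conditional_generation.guacamol import (
    AaeGenerator,
    MosesGenerator,
)

# Mirror the "Moses" branch of run_inference: the configuration carries the
# number of samples and the maximum sequence length.
config = AaeGenerator(n_samples=5, max_len=100)

# The generator wraps the configuration; target={} as in app.py.
model = MosesGenerator(configuration=config, target={})

# model.sample() yields SMILES strings; materialize five of them.
samples = list(model.sample(5))
print(samples)
```

Note that `run_inference` hard-codes `family = "Moses"`, so the GuacaMol branch (and its `population_size`, `random_start`, `patience` and `generations` keywords) is effectively dead code in this version of the Space.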
model_cards/article.md
CHANGED
@@ -1,37 +1,39 @@
 # Model documentation & parameters
 
-**Algorithm Version**: Which
+**Algorithm Version**: Which algorithm to use (VAE, AAE or ORGAN).
 
-**
+**Sequence length**: Maximal length of the sequence.
 
 **Number of samples**: How many samples should be generated (between 1 and 50).
 
-**Beam size**: Beam size used in beam search decoding (the higher the slower but better).
 
-**Seed**: The random seed used for initialization.
 
-
+# Model card - MOSES
 
-**
-
-**Developers**: Krzysztof Maziarz and co-authors from Microsoft Research and Novartis (full reference at bottom).
+**Model Details**: MOSES is a benchmarking platform for molecular generation models.
+
+**Developers**: Daniil Polykovskiy and colleagues from *Insilico Medicine*.
 
 **Distributors**: Developer's code wrapped and distributed by GT4SD Team (2023) from IBM Research.
 
-**Model date**: Released
+**Model date**: Released in 2020.
 
-**Model version**: Model provided by original authors,
+**Model version**: Model provided by original authors, adapted by GT4SD authors in [this GitHub repo](https://github.com/GT4SD/moses).
 
-**Model type**:
+**Model type**: Three types of models:
+1. **VAE**: VAEs [[1](https://pubs.acs.org/doi/full/10.1021/acscentsci.7b00572),[2](https://pubs.acs.org/doi/full/10.1021/acscentsci.7b00572),[3](https://arxiv.org/abs/1711.07839)] are a framework for training two neural networks, an encoder and a decoder, to learn a mapping from a high-dimensional data representation into a lower-dimensional space and back. The lower-dimensional space is called the latent space, which is often a continuous vector space with a normally distributed latent representation. VAE parameters are optimized to encode and decode data by minimizing the reconstruction loss while also minimizing a KL-divergence term arising from the variational approximation, which can loosely be interpreted as a regularization term. Since molecules are discrete objects, a properly trained VAE defines an invertible continuous representation of a molecule. MOSES combines aspects of both implementations, using a bidirectional Gated Recurrent Unit (GRU) with a linear output layer as the encoder. The decoder is a 3-layer GRU RNN with 512 hidden dimensions and intermediate dropout layers with dropout probability 0.2. Training is done with a batch size of 128, gradient clipping at 50, a KL-term weight of 1, and the Adam optimizer with a learning rate of 0.0003 for 50 epochs.
+2. **AAE**: AAEs [[1](https://arxiv.org/abs/1511.05644)] combine the idea of a VAE with the adversarial training used in GANs. One of the main drawbacks of the VAE is the KL-divergence term, which has a closed-form analytical solution only for a handful of distributions. In an AAE, the KL-divergence term is avoided by training a discriminator network to predict whether a given sample came from the latent space of the autoencoder or from a prior distribution. Parameters are optimized to minimize the reconstruction loss and the discriminator loss. [Kadurin et al.](https://arxiv.org/abs/1511.05644) applied the AAE architecture to the drug generation task. The model consists of an encoder with a 1-layer bidirectional LSTM with 380 hidden dimensions, a decoder with a 2-layer LSTM with 640 hidden dimensions, and a shared embedding of size 32. The latent space has dimension 640, and the discriminator network is a 2-layer fully connected neural network with 640 and 256 nodes, respectively, using the ELU activation function. Training is done with a batch size of 128 and the Adam optimizer with a learning rate of 0.001 for 25 epochs.
+3. **ORGAN**: The objective-reinforced generative adversarial network (ORGAN) (see [[1](https://arxiv.org/abs/1705.10843),[2](https://arxiv.org/abs/1705.10843)]) is a sequence generation model based on adversarial training that aims at generating discrete sequences emulating a data distribution while biasing the generation process towards desired objective rewards using reinforcement learning. ORGAN incorporates at least two networks: a generator and a discriminator. The goal of the generator is to create synthetic data examples that are indistinguishable from the empirical data distribution, while the discriminator learns to distinguish synthetic from real data samples. Both models are trained in alternation.
+To properly train a GAN, the gradient must be backpropagated between the generator and discriminator models, which is not possible when the data samples come from a discrete distribution such as a multinomial, since discrete distributions are non-differentiable. SeqGAN proposed to learn a policy gradient that can be backpropagated, computed with the REINFORCE algorithm. ORGAN extends this framework to include reward functions other than the discriminator. Reinforcement is done with an N-depth Monte Carlo tree search, and the reward is a weighted sum of the discriminator probability and the objective reward. The generator and discriminator are pre-trained for 250 and 50 epochs, respectively, and then jointly trained for 100 epochs with the Adam optimizer and a learning rate of 0.0001. In the MOSES experiments, chemical validity and uniqueness are used as rewards.
 
-**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**: Trained by the original authors with the default parameters provided [
+**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**: Trained by the original authors with the default parameters provided [in their paper](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
 
-**Paper or other resource for more information**: [
+**Paper or other resource for more information**: [Molecular Sets (MOSES): A Benchmarking Platform for Molecular Generation Models
+(2020; *Frontiers in Pharmacology*)](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
 
 **License**: MIT
 
-**Where to send questions or comments about the model**: Open an issue on
+**Where to send questions or comments about the model**: Open an issue on the [GT4SD repo](https://github.com/GT4SD/moses).
 
 **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
 
@@ -43,7 +45,7 @@
 
 **Metrics**: Validation loss on decoding correct molecules. Evaluated on several downstream tasks.
 
-**Datasets**:
+**Datasets**: 4M molecules from ZINC.
 
 **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
 
@@ -54,12 +56,14 @@ Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi
 ## Citation
 
 ```bib
-@
-
-
-
-
-
+@article{polykovskiy2020molecular,
+  title={Molecular sets (MOSES): a benchmarking platform for molecular generation models},
+  author={Polykovskiy, Daniil and Zhebrak, Alexander and Sanchez-Lengeling, Benjamin and Golovanov, Sergey and Tatanov, Oktai and Belyaev, Stanislav and Kurbanov, Rauf and Artamonov, Aleksey and Aladinskiy, Vladimir and Veselov, Mark and others},
+  journal={Frontiers in pharmacology},
+  volume={11},
+  pages={565644},
+  year={2020},
+  publisher={Frontiers Media SA}
 }
 ```
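As a reading aid for the model card above, the objectives it describes in prose can be summarized compactly. The formulas below are standard textbook formulations, not extracted from the MOSES code; the symbols β and λ are my notation for the "KL-term weight" and the "weighted sum" mentioned in the card.

```latex
% VAE: reconstruction loss plus a weighted KL regularizer on the latent code z
\mathcal{L}_{\mathrm{VAE}}(\theta, \phi; x)
    = -\mathbb{E}_{q_\phi(z \mid x)}\big[\log p_\theta(x \mid z)\big]
    + \beta \, D_{\mathrm{KL}}\big(q_\phi(z \mid x) \,\|\, p(z)\big)

% AAE: the KL term above is replaced by an adversarial loss, in which a
% discriminator tries to tell samples of q_phi(z | x) apart from prior samples p(z).

% ORGAN: sequence-level reward mixing the discriminator score D with a
% task objective O (e.g. validity or uniqueness), as described in the card.
R(y_{1:T}) = \lambda \, D(y_{1:T}) + (1 - \lambda) \, O(y_{1:T})
```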
model_cards/depr_description.md
ADDED
@@ -0,0 +1,8 @@
+<img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+
+Miscellaneous models from the [GuacaMol](https://github.com/GT4SD/guacamol_baselines) and the [Moses](https://github.com/GT4SD/moses) benchmarks. For details, please see the [GuacaMol paper (Brown et al., 2019, *J. Chem. Inf. Model.*)](https://pubs.acs.org/doi/full/10.1021/acs.jcim.8b00839) and the [Moses paper (Polykovskiy et al., 2020, *Front. Pharmacology*)](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
+
+**NOTE**: Only a subset of the features of the GuacaMol and Moses models are exposed through this UI. Most importantly, there is no support (yet) for property optimization. For details on this usage, please see the [GT4SD docs](https://gt4sd.github.io/gt4sd-core/api/gt4sd.algorithms.conditional_generation.guacamol.html).
+
+For **examples** and **documentation** of the model parameters, please see below.
+Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
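The deprecated description above still mentions the GuacaMol baselines, which app.py imports but disables. Purely as an illustration, a sketch of that GuacaMol path might look as follows; the keyword names (`population_size`, `random_start`, `patience`, `generations`) and the default values are taken from the commented-out branch and sliders in this commit, and whether they apply to a given GuacaMol application depends on your gt4sd installation.

```python
# Hypothetical sketch of the GuacaMol branch that app.py currently disables
# (family is hard-coded to "Moses"); keyword names come from that branch.
from gt4sd.algorithms.conditional_generation.guacamol import (
    GraphGAGenerator,
    GuacaMolGenerator,
)

# Configuration for the graph genetic-algorithm baseline, using the keywords
# and default slider values that app.py would pass for the GuacaMol family.
config = GraphGAGenerator(
    population_size=100,
    random_start=False,
    patience=4,
    generations=2,
)

model = GuacaMolGenerator(configuration=config, target={})
samples = list(model.sample(5))
print(samples)
```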
model_cards/description.md
CHANGED
@@ -1,6 +1,8 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
 
-
+Miscellaneous models from the [Moses](https://github.com/GT4SD/moses) benchmark. For details, please see the [Moses paper (Polykovskiy et al., 2020, *Front. Pharmacology*)](https://www.frontiersin.org/articles/10.3389/fphar.2020.565644/full).
+
+**NOTE**: Only a subset of the features of the Moses models are exposed through this UI. For details on this usage, please see the [GT4SD docs](https://gt4sd.github.io/gt4sd-core/api/gt4sd.algorithms.conditional_generation.guacamol.html).
 
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.