hf-transformers

Running

App Files Files Community

jannisborn committed on Jan 8, 2023

Commit

9d0d0bd

unverified ·

1 Parent(s): 1298030

update

Browse files

Files changed (4) hide show

README.md +1 -1
app.py +57 -45
model_cards/article.md +24 -19
model_cards/description.md +1 -1

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: GT4SD - Advanced Manufacturing
 emoji: 💡
 colorFrom: green
 colorTo: blue

 ---
+title: GT4SD - HuggingFace text generators
 emoji: 💡
 colorFrom: green
 colorTo: blue

app.py CHANGED Viewed

@@ -2,40 +2,61 @@ import logging
 import pathlib
 import gradio as gr
 import pandas as pd
-from gt4sd.algorithms.controlled_sampling.advanced_manufacturing import (
-    CatalystGenerator,
-    AdvancedManufacturing,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry
-from utils import draw_grid_generate
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
 def run_inference(
-    algorithm_version: str,
-    target_binding_energy: float,
-    primer_smiles: str,
     length: float,
-    number_of_points: int,
-    number_of_steps: int,
-    number_of_samples: int,
 ):
-    config = CatalystGenerator(
-        algorithm_version=algorithm_version,
-        number_of_points=number_of_points,
-        number_of_steps=number_of_steps,
-        generated_length=length,
-        primer_smiles=primer_smiles,
     )
-    model = AdvancedManufacturing(config, target=target_binding_energy)
-    samples = list(model.sample(number_of_samples))
-    seeds = [] if primer_smiles == "" else [primer_smiles]
-    return draw_grid_generate(samples=samples, n_cols=5, seeds=seeds)
 if __name__ == "__main__":
@@ -43,10 +64,8 @@ if __name__ == "__main__":
     # Preparation (retrieve all available algorithms)
     all_algos = ApplicationsRegistry.list_available()
     algos = [
-        x["algorithm_version"]
-        for x in list(
-            filter(lambda x: "AdvancedManufact" in x["algorithm_name"], all_algos)
-        )
     ]
     # Load metadata
@@ -64,39 +83,32 @@ if __name__ == "__main__":
     demo = gr.Interface(
         fn=run_inference,
-        title="Advanced Manufacturing",
         inputs=[
             gr.Dropdown(
                 algos,
-                label="Algorithm version",
-                value="v0",
             ),
-            gr.Slider(minimum=1, maximum=100, value=10, label="Target binding energy"),
             gr.Textbox(
-                label="Primer SMILES",
-                placeholder="FP(F)F.CP(C)c1ccccc1.[Au]",
                 lines=1,
             ),
             gr.Slider(
-                minimum=5,
-                maximum=400,
-                value=100,
-                label="Maximal sequence length",
-                step=1,
-            ),
-            gr.Slider(
-                minimum=16, maximum=128, value=32, label="Number of points", step=1
             ),
-            gr.Slider(
-                minimum=16, maximum=128, value=50, label="Number of steps", step=1
-            ),
-            gr.Slider(
-                minimum=1, maximum=50, value=10, label="Number of samples", step=1
             ),
         ],
-        outputs=gr.HTML(label="Output"),
         article=article,
         description=description,
-        # examples=examples.values.tolist(),
     )
     demo.launch(debug=True, show_error=True)

 import pathlib
 import gradio as gr
 import pandas as pd
+from gt4sd.algorithms.generation.hugging_face import (
+    HuggingFaceCTRLGenerator,
+    HuggingFaceGenerationAlgorithm,
+    HuggingFaceGPT2Generator,
+    HuggingFaceTransfoXLGenerator,
+    HuggingFaceOpenAIGPTGenerator,
+    HuggingFaceXLMGenerator,
+    HuggingFaceXLNetGenerator,
 )
 from gt4sd.algorithms.registry import ApplicationsRegistry
 logger = logging.getLogger(__name__)
 logger.addHandler(logging.NullHandler())
+MODEL_FN = {
+    "HuggingFaceCTRLGenerator": HuggingFaceCTRLGenerator,
+    "HuggingFaceGPT2Generator": HuggingFaceGPT2Generator,
+    "HuggingFaceTransfoXLGenerator": HuggingFaceTransfoXLGenerator,
+    "HuggingFaceOpenAIGPTGenerator": HuggingFaceOpenAIGPTGenerator,
+    "HuggingFaceXLMGenerator": HuggingFaceXLMGenerator,
+    "HuggingFaceXLNetGenerator": HuggingFaceXLNetGenerator,
+}
 def run_inference(
+    model_type: str,
+    prompt: str,
     length: float,
+    temperature: float,
+    prefix: str,
+    k: float,
+    p: float,
+    repetition_penalty: float,
 ):
+    model = model_type.split("_")[0]
+    version = model_type.split("_")[1]
+    if model not in MODEL_FN.keys():
+        raise ValueError(f"Model type {model} not supported")
+    config = MODEL_FN[model](
+        algorithm_version=version,
+        prompt=prompt,
+        length=length,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        k=k,
+        p=p,
+        prefix=prefix,
     )
+    model = HuggingFaceGenerationAlgorithm(config)
+    text = list(model.sample(1))[0]
+    return text
 if __name__ == "__main__":
     # Preparation (retrieve all available algorithms)
     all_algos = ApplicationsRegistry.list_available()
     algos = [
+        x["algorithm_application"] + "_" + x["algorithm_version"]
+        for x in list(filter(lambda x: "HuggingFace" in x["algorithm_name"], all_algos))
     ]
     # Load metadata
     demo = gr.Interface(
         fn=run_inference,
+        title="HuggingFace language models",
         inputs=[
             gr.Dropdown(
                 algos,
+                label="Language model",
+                value="HuggingFaceGPT2Generator",
             ),
             gr.Textbox(
+                label="Text prompt",
+                placeholder="I'm a stochastic parrot.",
                 lines=1,
             ),
+            gr.Slider(minimum=5, maximum=100, value=20, label="Maximal length", step=1),
             gr.Slider(
+                minimum=0.6, maximum=1.5, value=1.1, label="Decoding temperature"
             ),
+            gr.Textbox(
+                label="Prefix", placeholder="Some prefix (before the prompt)", lines=1
             ),
+            gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
+            gr.Slider(minimum=0.5, maximum=1, value=1.0, label="Decoding-p", step=1),
+            gr.Slider(minimum=0.5, maximum=5, value=1.0, label="Repetition penalty"),
         ],
+        outputs=gr.Textbox(label="Output"),
         article=article,
         description=description,
+        examples=examples.values.tolist(),
     )
     demo.launch(debug=True, show_error=True)

model_cards/article.md CHANGED Viewed

@@ -1,54 +1,60 @@
 # Model documentation & parameters
-**Algorithm Version**: Which model version to use.
-**Target binding energy**: The desired binding energy.
-**Primer SMILES**: A SMILES string used to prime the generation.
-**Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
-**Number of points**: Number of points to sample with the Gaussian Process.
-**Number of steps**: Number of optimization steps in the Gaussian Process optimization.
-**Number of samples**: How many samples should be generated (between 1 and 50).
-# Model card -- AdvancedManufacturing
-**Model Details**: *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
-**Developers**: Oliver Schilter and colleagues from IBM Research.
-**Distributors**: Original authors' code integrated into GT4SD.
-**Model date**: Not yet published.
-**Model version**: Different types of models trained on NCCR data using SMILES or SELFIES, potentially also with augmentation.
-**Model type**: A sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
 N.A.
 **Paper or other resource for more information**:
-TBD
 **License**: MIT
 **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
-**Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
-**Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
 **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
 **Metrics**: N.A.
-**Datasets**: Data provided through NCCR.
 **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
@@ -57,7 +63,6 @@ TBD
 Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
 ## Citation
-TBD, temporarily please cite:
 ```bib
 @article{manica2022gt4sd,
   title={GT4SD: Generative Toolkit for Scientific Discovery},

 # Model documentation & parameters
+**Language model**: Type of language model to be used.
+**Text prompt**: The text prompt to condition the model.
+**Maximal length**: The maximal number of tokens in the generated text.
+**Decoding temperature**: The temperature used to scale the token probabilities during decoding; higher values give more diverse text.
+**Prefix**: A text prompt that will be passed to the model **before** the prompt.
+**Top-k**: Number of top-k probability tokens to keep.
+**Decoding-p**: Only tokens with cumulative probabilities summing up to this value are kept.
+**Repetition penalty**: Penalty for repeating tokens. Leave at the default value (1.0) for most models; for the CTRL model, use 1.2.
+# Model card -- HuggingFace
+**Model Details**: Various Transformer-based language models.
+**Developers**: HuggingFace developers
+**Distributors**: HuggingFace developers' code integrated into GT4SD.
+**Model date**: Varies between models.
+**Model type**: Different types of `transformers` language models:
+- CTRL: `CTRLLMHeadModel`
+- GPT2: `GPT2LMHeadModel`
+- XLNet: `XLNetLMHeadModel`
+- OpenAIGPT: `OpenAIGPTLMHeadModel`
+- TransfoXL: `TransfoXLLMHeadModel`
+- XLM: `XLMWithLMHeadModel`
 **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
 N.A.
 **Paper or other resource for more information**:
+All documentation is available in the [transformers documentation](https://huggingface.co/docs/transformers/).
 **License**: MIT
 **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
+**Intended Use. Use cases that were envisioned during development**: N.A.
+**Primary intended uses/users**: N.A.
 **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
 **Metrics**: N.A.
+**Datasets**: N.A.
 **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
 Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
 ## Citation
 ```bib
 @article{manica2022gt4sd,
   title={GT4SD: Generative Toolkit for Scientific Discovery},

model_cards/description.md CHANGED Viewed

@@ -1,6 +1,6 @@
 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
-*AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.

 <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
+This UI gives access to some pretrained language models from [*HuggingFace*](https://github.com/huggingface/) that are distributed via GT4SD.
 For **examples** and **documentation** of the model parameters, please see below.
 Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.