jannisborn committed on
Commit
9d0d0bd
·
unverified ·
1 Parent(s): 1298030
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +57 -45
  3. model_cards/article.md +24 -19
  4. model_cards/description.md +1 -1
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: GT4SD - Advanced Manufacturing
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
 
1
  ---
2
+ title: GT4SD - HuggingFace text generators
3
  emoji: 💡
4
  colorFrom: green
5
  colorTo: blue
app.py CHANGED
@@ -2,40 +2,61 @@ import logging
2
  import pathlib
3
  import gradio as gr
4
  import pandas as pd
5
- from gt4sd.algorithms.controlled_sampling.advanced_manufacturing import (
6
- CatalystGenerator,
7
- AdvancedManufacturing,
 
 
 
 
 
8
  )
9
  from gt4sd.algorithms.registry import ApplicationsRegistry
10
 
11
- from utils import draw_grid_generate
12
 
13
  logger = logging.getLogger(__name__)
14
  logger.addHandler(logging.NullHandler())
15
 
 
 
 
 
 
 
 
 
 
16
 
17
  def run_inference(
18
- algorithm_version: str,
19
- target_binding_energy: float,
20
- primer_smiles: str,
21
  length: float,
22
- number_of_points: int,
23
- number_of_steps: int,
24
- number_of_samples: int,
 
 
25
  ):
 
 
26
 
27
- config = CatalystGenerator(
28
- algorithm_version=algorithm_version,
29
- number_of_points=number_of_points,
30
- number_of_steps=number_of_steps,
31
- generated_length=length,
32
- primer_smiles=primer_smiles,
 
 
 
 
 
33
  )
34
- model = AdvancedManufacturing(config, target=target_binding_energy)
35
- samples = list(model.sample(number_of_samples))
36
- seeds = [] if primer_smiles == "" else [primer_smiles]
37
 
38
- return draw_grid_generate(samples=samples, n_cols=5, seeds=seeds)
 
 
 
39
 
40
 
41
  if __name__ == "__main__":
@@ -43,10 +64,8 @@ if __name__ == "__main__":
43
  # Preparation (retrieve all available algorithms)
44
  all_algos = ApplicationsRegistry.list_available()
45
  algos = [
46
- x["algorithm_version"]
47
- for x in list(
48
- filter(lambda x: "AdvancedManufact" in x["algorithm_name"], all_algos)
49
- )
50
  ]
51
 
52
  # Load metadata
@@ -64,39 +83,32 @@ if __name__ == "__main__":
64
 
65
  demo = gr.Interface(
66
  fn=run_inference,
67
- title="Advanced Manufacturing",
68
  inputs=[
69
  gr.Dropdown(
70
  algos,
71
- label="Algorithm version",
72
- value="v0",
73
  ),
74
- gr.Slider(minimum=1, maximum=100, value=10, label="Target binding energy"),
75
  gr.Textbox(
76
- label="Primer SMILES",
77
- placeholder="FP(F)F.CP(C)c1ccccc1.[Au]",
78
  lines=1,
79
  ),
 
80
  gr.Slider(
81
- minimum=5,
82
- maximum=400,
83
- value=100,
84
- label="Maximal sequence length",
85
- step=1,
86
- ),
87
- gr.Slider(
88
- minimum=16, maximum=128, value=32, label="Number of points", step=1
89
  ),
90
- gr.Slider(
91
- minimum=16, maximum=128, value=50, label="Number of steps", step=1
92
- ),
93
- gr.Slider(
94
- minimum=1, maximum=50, value=10, label="Number of samples", step=1
95
  ),
 
 
 
96
  ],
97
- outputs=gr.HTML(label="Output"),
98
  article=article,
99
  description=description,
100
- # examples=examples.values.tolist(),
101
  )
102
  demo.launch(debug=True, show_error=True)
 
2
  import pathlib
3
  import gradio as gr
4
  import pandas as pd
5
+ from gt4sd.algorithms.generation.hugging_face import (
6
+ HuggingFaceCTRLGenerator,
7
+ HuggingFaceGenerationAlgorithm,
8
+ HuggingFaceGPT2Generator,
9
+ HuggingFaceTransfoXLGenerator,
10
+ HuggingFaceOpenAIGPTGenerator,
11
+ HuggingFaceXLMGenerator,
12
+ HuggingFaceXLNetGenerator,
13
  )
14
  from gt4sd.algorithms.registry import ApplicationsRegistry
15
 
 
16
 
17
  logger = logging.getLogger(__name__)
18
  logger.addHandler(logging.NullHandler())
19
 
20
+ MODEL_FN = {
21
+ "HuggingFaceCTRLGenerator": HuggingFaceCTRLGenerator,
22
+ "HuggingFaceGPT2Generator": HuggingFaceGPT2Generator,
23
+ "HuggingFaceTransfoXLGenerator": HuggingFaceTransfoXLGenerator,
24
+ "HuggingFaceOpenAIGPTGenerator": HuggingFaceOpenAIGPTGenerator,
25
+ "HuggingFaceXLMGenerator": HuggingFaceXLMGenerator,
26
+ "HuggingFaceXLNetGenerator": HuggingFaceXLNetGenerator,
27
+ }
28
+
29
 
30
  def run_inference(
31
+ model_type: str,
32
+ prompt: str,
 
33
  length: float,
34
+ temperature: float,
35
+ prefix: str,
36
+ k: float,
37
+ p: float,
38
+ repetition_penalty: float,
39
  ):
40
+ model = model_type.split("_")[0]
41
+ version = model_type.split("_")[1]
42
 
43
+ if model not in MODEL_FN.keys():
44
+ raise ValueError(f"Model type {model} not supported")
45
+ config = MODEL_FN[model](
46
+ algorithm_version=version,
47
+ prompt=prompt,
48
+ length=length,
49
+ temperature=temperature,
50
+ repetition_penalty=repetition_penalty,
51
+ k=k,
52
+ p=p,
53
+ prefix=prefix,
54
  )
 
 
 
55
 
56
+ model = HuggingFaceGenerationAlgorithm(config)
57
+ text = list(model.sample(1))[0]
58
+
59
+ return text
60
 
61
 
62
  if __name__ == "__main__":
 
64
  # Preparation (retrieve all available algorithms)
65
  all_algos = ApplicationsRegistry.list_available()
66
  algos = [
67
+ x["algorithm_application"] + "_" + x["algorithm_version"]
68
+ for x in list(filter(lambda x: "HuggingFace" in x["algorithm_name"], all_algos))
 
 
69
  ]
70
 
71
  # Load metadata
 
83
 
84
  demo = gr.Interface(
85
  fn=run_inference,
86
+ title="HuggingFace language models",
87
  inputs=[
88
  gr.Dropdown(
89
  algos,
90
+ label="Language model",
91
+ value="HuggingFaceGPT2Generator",
92
  ),
 
93
  gr.Textbox(
94
+ label="Text prompt",
95
+ placeholder="I'm a stochastic parrot.",
96
  lines=1,
97
  ),
98
+ gr.Slider(minimum=5, maximum=100, value=20, label="Maximal length", step=1),
99
  gr.Slider(
100
+ minimum=0.6, maximum=1.5, value=1.1, label="Decoding temperature"
 
 
 
 
 
 
 
101
  ),
102
+ gr.Textbox(
103
+ label="Prefix", placeholder="Some prefix (before the prompt)", lines=1
 
 
 
104
  ),
105
+ gr.Slider(minimum=2, maximum=500, value=50, label="Top-k", step=1),
106
+ gr.Slider(minimum=0.5, maximum=1, value=1.0, label="Decoding-p", step=1),
107
+ gr.Slider(minimum=0.5, maximum=5, value=1.0, label="Repetition penalty"),
108
  ],
109
+ outputs=gr.Textbox(label="Output"),
110
  article=article,
111
  description=description,
112
+ examples=examples.values.tolist(),
113
  )
114
  demo.launch(debug=True, show_error=True)
model_cards/article.md CHANGED
@@ -1,54 +1,60 @@
1
  # Model documentation & parameters
2
 
3
- **Algorithm Version**: Which model version to use.
4
 
5
- **Target binding energy**: The desired binding energy.
6
 
7
- **Primer SMILES**: A SMILES string used to prime the generation.
8
 
9
- **Maximal sequence length**: The maximal number of SMILES tokens in the generated molecule.
10
 
11
- **Number of points**: Number of points to sample with the Gaussian Process.
12
 
13
- **Number of steps**: Number of optimization steps in the Gaussian Process optimization.
14
 
15
- **Number of samples**: How many samples should be generated (between 1 and 50).
16
 
 
17
 
18
 
19
- # Model card -- AdvancedManufacturing
20
 
21
- **Model Details**: *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
22
 
23
- **Developers**: Oliver Schilter and colleagues from IBM Research.
24
 
25
- **Distributors**: Original authors' code integrated into GT4SD.
26
 
27
- **Model date**: Not yet published.
28
 
29
- **Model version**: Different types of models trained on NCCR data using SMILES or SELFIES, potentially also with augmentation.
30
 
31
- **Model type**: A sequence-based molecular generator tuned to generate catalysts. The model relies on a recurrent Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
 
 
 
 
 
 
32
 
33
  **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
34
  N.A.
35
 
36
  **Paper or other resource for more information**:
37
- TBD
38
 
39
  **License**: MIT
40
 
41
  **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
42
 
43
- **Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
44
 
45
- **Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
46
 
47
  **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
48
 
49
  **Metrics**: N.A.
50
 
51
- **Datasets**: Data provided through NCCR.
52
 
53
  **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
54
 
@@ -57,7 +63,6 @@ TBD
57
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
58
 
59
  ## Citation
60
- TBD, temporarily please cite:
61
  ```bib
62
  @article{manica2022gt4sd,
63
  title={GT4SD: Generative Toolkit for Scientific Discovery},
 
1
  # Model documentation & parameters
2
 
3
+ **Language model**: Type of language model to be used.
4
 
5
+ **Text prompt**: The text prompt to condition the model.
6
 
7
+ **Maximal length**: The maximal number of tokens in the generated text.
8
 
9
+ **Decoding temperature**: The temperature used when sampling tokens during decoding; lower values make the generation greedier.
10
 
11
+ **Prefix**: A text prompt that will be passed to the model **before** the prompt.
12
 
13
+ **Top-k**: Number of top-k probability tokens to keep.
14
 
15
+ **Decoding-p**: Only tokens with cumulative probabilities summing up to this value are kept.
16
 
17
+ **Repetition penalty**: Penalty for repeating tokens. Leave at the default (1.0) for most models; for the CTRL model, use 1.2.
18
 
19
 
 
20
 
21
+ # Model card -- HuggingFace
22
 
23
+ **Model Details**: Various Transformer-based language models.
24
 
25
+ **Developers**: HuggingFace developers
26
 
27
+ **Distributors**: HuggingFace developers' code integrated into GT4SD.
28
 
29
+ **Model date**: Varies between models.
30
 
31
+ **Model type**: Different types of `transformers` language models:
32
+ - CTRL: `CTRLLMHeadModel`
33
+ - GPT2: `GPT2LMHeadModel`
34
+ - XLNet: `XLNetLMHeadModel`
35
+ - OpenAIGPT: `OpenAIGPTLMHeadModel`
36
+ - TransfoXL: `TransfoXLLMHeadModel`
37
+ - XLM: `XLMWithLMHeadModel`
38
 
39
  **Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
40
  N.A.
41
 
42
  **Paper or other resource for more information**:
43
+ All documentation available from [transformers documentation](https://huggingface.co/docs/transformers/)
44
 
45
  **License**: MIT
46
 
47
  **Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
48
 
49
+ **Intended Use. Use cases that were envisioned during development**: N.A.
50
 
51
+ **Primary intended uses/users**: N.A.
52
 
53
  **Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
54
 
55
  **Metrics**: N.A.
56
 
57
+ **Datasets**: N.A.
58
 
59
  **Ethical Considerations**: Unclear, please consult with original authors in case of questions.
60
 
 
63
  Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
64
 
65
  ## Citation
 
66
  ```bib
67
  @article{manica2022gt4sd,
68
  title={GT4SD: Generative Toolkit for Scientific Discovery},
model_cards/description.md CHANGED
@@ -1,6 +1,6 @@
1
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
 
3
- *AdvancedManufacturing* is a sequence-based molecular generator tuned to generate catalysts. The model relies on a Variational Autoencoder with a binding-energy predictor trained on the latent code. The framework uses Gaussian Processes for generating targeted molecules.
4
 
5
  For **examples** and **documentation** of the model parameters, please see below.
6
  Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
 
1
  <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
2
 
3
+ This UI gives access to some pretrained language models from [*HuggingFace*](https://github.com/huggingface/) that are distributed via GT4SD.
4
 
5
  For **examples** and **documentation** of the model parameters, please see below.
6
  Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.