kuznetsov-insilico
committed on
Upload 10 files
Browse files- README.md +143 -0
- config.json +29 -0
- generation_config.json +8 -0
- images/scheme.svg +0 -0
- pytorch_model.bin +3 -0
- scheme.svg +0 -0
- special_tokens_map.json +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
tags:
|
3 |
+
- chemistry
|
4 |
+
- medical
|
5 |
+
widget:
|
6 |
+
- text: <LIGAND>
|
7 |
+
example_title: Generate molecule
|
8 |
+
---
|
9 |
+
# BindGPT: A Scalable Framework for 3D Molecular Design via Language Modeling and Reinforcement Learning
|
10 |
+
|
11 |
+
![alt text](./images/scheme.svg "Main image")
|
12 |
+
|
13 |
+
> BindGPT is a new framework for building drug discovery models that leverages compute-efficient pretraining, supervised fine-tuning, prompting, reinforcement learning, and tool use of LMs. This allows BindGPT to build a single pre-trained model that exhibits state-of-the-art performance in 3D Molecule Generation, 3D Conformer Generation, and Pocket-Conditioned 3D Molecule Generation, posing them as downstream tasks for a pretrained model, while previous methods build task-specialized models without task transfer abilities. At the same time, thanks to the fast transformer inference technology, BindGPT is 2 orders of magnitude (100 times) faster than previous methods at generation.
|
14 |
+
- **Website:** https://bindgpt.github.io
|
15 |
+
- **Repository:** https://github.com/insilicomedicine/bindgpt
|
16 |
+
- **Paper:** https://arxiv.org/abs/2406.03686
|
17 |
+
|
18 |
+
|
19 |
+
**This page provides the pretrained version of BindGPT.**
|
20 |
+
The pretrained model is capable of zero-shot molecule generation and conformer generation within
|
21 |
+
the distribution of the [Uni-Mol](https://github.com/deepmodeling/Uni-Mol) dataset.
|
22 |
+
We also expose finetuned models:
|
23 |
+
|
24 |
+
- For the model finetuned on GEOM-DRUGS, visit [huggingface.co/insilicomedicine/BindGPT-GEOM](https://huggingface.co/insilicomedicine/BindGPT-GEOM)
|
25 |
+
- For the model finetuned with Reinforcement Learning on CrossDocked, visit [huggingface.co/insilicomedicine/BindGPT-RL](https://huggingface.co/insilicomedicine/BindGPT-RL)
|
26 |
+
|
27 |
+
|
28 |
+
## Unconditional generation
|
29 |
+
|
30 |
+
The code below provides a minimal standalone example of
|
31 |
+
sampling molecules from the model. It only depends on
|
32 |
+
`transformers`, `tokenizers`, `rdkit`, and `pytorch`
|
33 |
+
and it's not meant to reproduce the sampling speed reported
|
34 |
+
in the paper (e.g. it does not use flash-attention, mixed precision,
|
35 |
+
and large batch sampling).
|
36 |
+
To reproduce sampling speed, please use the code from our repository:
|
37 |
+
https://github.com/insilicomedicine/bindgpt
|
38 |
+
|
39 |
+
```python
|
40 |
+
# Download the model from Hugging Face:
|
41 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
42 |
+
|
43 |
+
tokenizer = AutoTokenizer.from_pretrained("artemZholus/BindGPT")
|
44 |
+
model = AutoModelForCausalLM.from_pretrained("artemZholus/BindGPT").cuda()
|
45 |
+
|
46 |
+
# Generate 10 tokenized molecules without condition
|
47 |
+
NUM_SAMPLES = 10
|
48 |
+
|
49 |
+
start_tokens = tokenizer("<LIGAND>", return_tensors="pt")
|
50 |
+
outputs = model.generate(
|
51 |
+
# remove EOS token to continue generation
|
52 |
+
input_ids=start_tokens['input_ids'][:, :-1].cuda(),
|
53 |
+
attention_mask=start_tokens['attention_mask'][:, :-1].cuda(),
|
54 |
+
do_sample=True, max_length=400, num_return_sequences=NUM_SAMPLES
|
55 |
+
)
|
56 |
+
|
57 |
+
|
58 |
+
# parse results
|
59 |
+
import re
|
60 |
+
from rdkit import Chem
|
61 |
+
def parse_molecule(s):
    """Parse one decoded model output into an RDKit molecule with a conformer.

    The text is expected to contain a ``<LIGAND>`` tag followed by a SMILES
    string, then an ``<XYZ>`` tag followed by space-separated coordinates
    (three floats per atom of the parsed SMILES).

    Args:
        s: A single decoded string produced by the model.

    Returns:
        An RDKit ``Mol`` with one conformer attached, or ``None`` when the
        text cannot be parsed (missing or repeated tags, non-numeric
        coordinates, invalid SMILES, or a coordinate count that does not
        match the number of atoms).
    """
    # Explicit checks instead of `assert`: asserts are stripped under
    # `python -O`, which would silently disable the validation.
    if '<LIGAND>' not in s or '<XYZ>' not in s:
        return None

    parts = re.split(r'<LIGAND>|<XYZ>', s)
    if len(parts) != 3:
        # A repeated tag makes the layout ambiguous; sampled text can be
        # noisy, so reject it rather than fail on tuple unpacking.
        return None
    _, smiles, xyz = parts
    smiles = re.sub(r'\s', '', smiles)

    try:
        # The first two space-separated fields after <XYZ> are skipped.
        # NOTE(review): presumably a leading separator/header emitted by the
        # tokenizer - confirm against the model's output format.
        coords = [float(token) for token in xyz.split(' ')[2:]]
    except ValueError:
        # A garbled coordinate token must not abort the whole batch.
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    num_atoms = mol.GetNumAtoms()
    if len(coords) != 3 * num_atoms:
        return None

    conf = Chem.Conformer()
    for j in range(num_atoms):
        conf.SetAtomPosition(j, coords[3 * j:3 * j + 3])
    mol.AddConformer(conf)
    return mol
|
77 |
+
|
78 |
+
string_molecules = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
79 |
+
molecules = [parse_molecule(mol) for mol in string_molecules]
|
80 |
+
```
|
81 |
+
|
82 |
+
## Conformer generation
|
83 |
+
|
84 |
+
The code below provides a minimal standalone example of
|
85 |
+
sampling conformers for a given molecule from the model. It only depends on
|
86 |
+
`transformers`, `tokenizers`, `rdkit`, and `pytorch`
|
87 |
+
and it's not meant to reproduce the sampling speed reported
|
88 |
+
in the paper (e.g. it does not use flash-attention, mixed precision,
|
89 |
+
and large batch sampling).
|
90 |
+
To reproduce sampling speed, please use the code from our repository:
|
91 |
+
https://github.com/insilicomedicine/bindgpt
|
92 |
+
|
93 |
+
```python
|
94 |
+
smiles = [
|
95 |
+
'O=c1n(CCO)c2ccccc2n1CCO',
|
96 |
+
'Cc1ccc(C#N)cc1S(=O)(=O)NCc1ccnc(OC(C)(C)C)c1',
|
97 |
+
'COC(=O)Cc1csc(NC(=O)Cc2coc3cc(C)ccc23)n1',
|
98 |
+
]
|
99 |
+
|
100 |
+
# tell the tokenizer to pad on the left, i.e. right-align the sequences
|
101 |
+
tokenizer.padding_side = 'left'
|
102 |
+
# Do not forget to add the <XYZ> token
|
103 |
+
# after the smiles, otherwise the model might
|
104 |
+
# want to continue generating the molecule :)
|
105 |
+
prompts = tokenizer(
|
106 |
+
["<LIGAND>" + s + '<XYZ>' for s in smiles], return_tensors="pt",
|
107 |
+
truncation=True, padding=True,
|
108 |
+
)
|
109 |
+
|
110 |
+
# Generate 1 conformer per molecule
|
111 |
+
outputs = model.generate(
|
112 |
+
# remove EOS token to continue generation
|
113 |
+
input_ids=prompts['input_ids'][:, :-1].cuda(),
|
114 |
+
attention_mask=prompts['attention_mask'][:, :-1].cuda(),
|
115 |
+
do_sample=True, max_length=400,
|
116 |
+
# you can combine this type of conditional generation
|
117 |
+
# with multi-sample generation.
|
118 |
+
# to sample many conformers per molecule, uncomment this
|
119 |
+
# num_return_sequences=10
|
120 |
+
)
|
121 |
+
|
122 |
+
# parse results
|
123 |
+
string_molecules = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
124 |
+
molecules = [parse_molecule(mol) for mol in string_molecules]
|
125 |
+
```
|
126 |
+
|
127 |
+
## Usage and License
|
128 |
+
|
129 |
+
Please note that all model weights are exclusively licensed for research purposes. The accompanying dataset is licensed under CC BY 4.0, which permits solely non-commercial usage.
|
130 |
+
We emphatically urge all users to adhere to the highest ethical standards when using our models, including maintaining fairness, transparency, and responsibility in their research. Any usage that may lead to harm or pose a detriment to society is strictly forbidden.
|
131 |
+
|
132 |
+
|
133 |
+
## References
|
134 |
+
If you use our repository, please cite the following related paper:
|
135 |
+
|
136 |
+
```
|
137 |
+
@article{zholus2021bindgpt,
|
138 |
+
author = {Artem Zholus and Maksim Kuznetsov and Roman Schutski and Rim Shayakhmetov and Daniil Polykovskiy and Sarath Chandar and Alex Zhavoronkov},
|
139 |
+
title = {BindGPT: A Scalable Framework for 3D Molecular Design via Language Modeling and Reinforcement Learning},
|
140 |
+
journal = {arXiv},
|
141 |
+
year = {2024},
|
142 |
+
}
|
143 |
+
```
|
config.json
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"GPTNeoXForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_dropout": 0.0,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"classifier_dropout": 0.1,
|
8 |
+
"eos_token_id": 1,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout": 0.0,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 2048,
|
16 |
+
"model_type": "gpt_neox",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 15,
|
19 |
+
"pad_token_id": 3,
|
20 |
+
"rope_scaling": null,
|
21 |
+
"rotary_emb_base": 10000,
|
22 |
+
"rotary_pct": 0.25,
|
23 |
+
"tie_word_embeddings": false,
|
24 |
+
"torch_dtype": "bfloat16",
|
25 |
+
"transformers_version": "4.34.1",
|
26 |
+
"use_cache": true,
|
27 |
+
"use_parallel_residual": false,
|
28 |
+
"vocab_size": 1287
|
29 |
+
}
|
generation_config.json
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 0,
|
4 |
+
"eos_token_id": 1,
|
5 |
+
"pad_token_id": 3,
|
6 |
+
"transformers_version": "4.34.1",
|
7 |
+
"use_cache": false
|
8 |
+
}
|
images/scheme.svg
ADDED
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5549da1bb564f3313b4b6236bf6651c2daa1d554c19d8d460572a26fe2d1f193
|
3 |
+
size 433250534
|
scheme.svg
ADDED
special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"pad_token": "<PAD>"
|
3 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40469a685bbf0934e131f4c8953893f64c616b2bc00958629b50e6eac61ab5f2
|
3 |
+
size 8696
|