carsonhxsu committed

Commit · f50d964
Parent(s): 4f41ca5

init code

Browse files:
- .gitattributes +1 -0
- .gitignore +6 -0
- README.md +112 -0
- demo.py +26 -0
- lyra_baichuan/__init__.py +1 -0
- lyra_baichuan/config.py +34 -0
- lyra_baichuan/lyra_baichuan.py +367 -0
- lyra_baichuan/model.py +166 -0
- lyra_baichuan/tokenization_baichuan.py +232 -0
- requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
dist/
*.egg-info/
__pycache__
build/
.vscode
.idea
README.md CHANGED
@@ -1,3 +1,115 @@
---
license: mit
language: en
tags:
- LLM
- Baichuan-7B
- Baichuan-13B
- Baichuan2-7B
- Baichuan2-13B
---
## Model Card for lyraBaichuan

lyraBaichuan is currently the **fastest** available version of the Baichuan models (Baichuan-7B, Baichuan-13B, Baichuan2-7B, Baichuan2-13B). Its inference speed reaches **4300+ tokens/s** on an A100, up to a **2.4x** speedup over the PyTorch version.

Main features:
- device: NVIDIA GPUs with Ampere architecture (A100 or higher) or Volta architecture (V100).
- batch_size: compiled with dynamic batch size; the maximum depends on the device.
- MEMOPT mode: significantly reduced VRAM usage and increased speed.

We use the Baichuan2-7B-Base and Baichuan2-13B-Base models for measurement, but the optimized inference also applies to other Baichuan models, including Baichuan-7B and Baichuan-13B.

## Speed

* Measured in tokens/s
* Tested on A100 40G
* MEMOPT mode

### Baichuan2-7B-Base

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch 2.0.1 | 41.2 | 323.2 | 640.0 | 1256.8 | 2231.0 |
| lyraBaichuan MEMOPT | 125.9 | 948.1 | 1749.3 | 2974.0 | 4370.1 |

### Baichuan2-13B-Base

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch 2.0.1 | 40.9 | 307.9 | 555.6 | 1010.4 | 1601.0 |
| lyraBaichuan MEMOPT | 80.0 | 568.2 | 1124.4 | 1942.6 | 2828.0 |
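The table reports throughput only; the benchmark script itself is not part of this commit. As a minimal sketch of how such a number could be reproduced with the `generate` API from the Uses section below (assumptions: greedy decoding, wall-clock timing, tokens counted with the model's own tokenizer):

```python
import time

# Hypothetical measurement sketch; `model` is constructed as in the Uses section below.
batch_size = 64
prompts = ["登鹳雀楼->王之涣\n夜雨寄北->"] * batch_size
max_output_length = 64

start = time.perf_counter()
output_texts = model.generate(prompts, output_length=max_output_length, do_sample=False)
elapsed = time.perf_counter() - start

# Count the tokens actually generated (generation may stop early at the end token),
# then divide by wall time to get tokens/s.
generated = sum(len(model.tokenizer(t, add_special_tokens=False).input_ids) for t in output_texts)
print(f"{generated / elapsed:.1f} tokens/s")
```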
## Docker Environment Recommendation

- For CUDA 11.X: we recommend `nvcr.io/nvidia/pytorch:22.12-py3`
- For CUDA 12.0: we recommend `nvcr.io/nvidia/pytorch:23.02-py3`

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraBaichuan nvcr.io/nvidia/pytorch:23.02-py3

cd /lyraBaichuan   # the repository is mounted here by the command above
pip install -r requirements.txt
python demo.py
```

## Uses

```python
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B

model_path = "./models/Baichuan2-13B-lyra"
tokenizer_path = "./models/Baichuan2-13B-lyra"
inference_dtype = 'fp16'
prompt = "登鹳雀楼->王之涣\n夜雨寄北->"

memopt_mode = 1
max_output_length = 64
arch = "Ampere"    # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

# To use a 7B model, initialize with lyraBaichuan7B instead.
model = lyraBaichuan13B(model_path,
                        tokenizer_path=tokenizer_path,
                        dtype=inference_dtype,
                        memopt_mode=memopt_mode,
                        arch=arch,
                        cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
```
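Two usage notes that follow directly from the `generate` implementation in `lyra_baichuan/lyra_baichuan.py` below:

```python
# A single prompt string is also accepted; it is wrapped into a batch of one.
print(model.generate(prompt, output_length=max_output_length))

# Decoding is greedy by default; pass do_sample=True to enable top-k / top-p
# sampling (a per-request random seed is drawn internally in that case).
print(model.generate(prompts, output_length=max_output_length,
                     top_k=30, top_p=0.85, temperature=1.0, do_sample=True))
```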
## Demo Outputs

### Baichuan2-13B-Base

#### input

登鹳雀楼->王之涣

夜雨寄北->

#### output

## TODO

1. Support for int4 quantization.
2. Inference for longer contexts.
3. Streaming inference mode.

## Citation

``` bibtex
@Misc{lyraBaichuan2023,
  author =       {Haoxiong Su, Kangjian Wu, Zhengtao Wang, Yibo Lu, Bin Wu},
  title =        {lyraBaichuan: Accelerating Baichuan models to 4300+ tokens/s},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraBaichuan}},
  year =         {2023}
}
```

## Report bug

- Start a discussion at https://huggingface.co/TMElyralab/lyraBaichuan to report any bugs.
- Mark the title of the report with a `[bug]` tag.
demo.py ADDED
@@ -0,0 +1,26 @@
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B

model_path = "./models/Baichuan2-13B-lyra"
tokenizer_path = "./models/Baichuan2-13B-lyra"
inference_dtype = 'fp16'
prompt = "登鹳雀楼->王之涣\n夜雨寄北->"

memopt_mode = 1
max_output_length = 64
arch = "Ampere"    # Ampere or Volta
cuda_version = 12  # cuda version, we currently support 11 and 12

model = lyraBaichuan13B(model_path,
                        tokenizer_path=tokenizer_path,
                        dtype=inference_dtype,
                        memopt_mode=memopt_mode,
                        arch=arch,
                        cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
lyra_baichuan/__init__.py ADDED
@@ -0,0 +1 @@
from .lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
lyra_baichuan/config.py ADDED
@@ -0,0 +1,34 @@
import dataclasses
from typing import Optional


@dataclasses.dataclass
class LyraBaichuanParam:
    num_heads: int = 40
    size_per_head: int = 128
    inter_size: int = 13824
    num_layers: int = 40
    vocab_size: int = 39424
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    layernorm_eps: float = 1e-6
    weights_data_type: str = "fp16"
    rotary_embedding: int = 128
    use_gptj_residual: bool = False

    def __post_init__(self):
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            raise ValueError(
                f'Got an invalid value of shared_context_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        return dataclasses.asdict(self)


LYRA_BAICHUAN_PARAM = LyraBaichuanParam()
LIB_SO_PATH = '/usr/lib/ftlib/libth_transformer.so'
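These defaults describe a 13B-style layout and act only as a fallback when no `config.ini` sits next to the weights (see `lyra_baichuan.py` below). A hedged sketch of building a variant without mutating the module-level default; the numbers here are illustrative placeholders, not official Baichuan-7B dimensions:

```python
import dataclasses

from lyra_baichuan.config import LYRA_BAICHUAN_PARAM

# Hypothetical override with placeholder values; __post_init__ re-validates
# shared_contexts_ratio on the copy returned by dataclasses.replace.
custom_param = dataclasses.replace(LYRA_BAICHUAN_PARAM,
                                   num_heads=32,
                                   num_layers=32,
                                   inter_size=11008)
print(custom_param.asdict())
```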
lyra_baichuan/lyra_baichuan.py ADDED
@@ -0,0 +1,367 @@
from __future__ import annotations

import configparser
import pathlib
import typing
import os

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from .config import LYRA_BAICHUAN_PARAM, LIB_SO_PATH
from .model import BaichuanModel
from .tokenization_baichuan import BaichuanTokenizer


class lyraBaichuan7B:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'baichuan'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
                              size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
                              inter_size=LYRA_BAICHUAN_PARAM.inter_size,
                              layer_num=LYRA_BAICHUAN_PARAM.num_layers,
                              rotary_embedding_dim=LYRA_BAICHUAN_PARAM.rotary_embedding,
                              layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
                              vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
                              start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
                              tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # Load the C++ model into a PyTorch model, picking the prebuilt library
        # that matches the GPU architecture and CUDA version.
        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"

        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized Baichuan-7B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad token '
                  'id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = BaichuanModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to know the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the token ids into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts


class lyraBaichuan13B:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'baichuan'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=0,
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
                              size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
                              inter_size=LYRA_BAICHUAN_PARAM.inter_size,
                              layer_num=LYRA_BAICHUAN_PARAM.num_layers,
                              rotary_embedding_dim=0,
                              layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
                              vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
                              start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
                              tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # Load the C++ model into a PyTorch model, picking the prebuilt library
        # that matches the GPU architecture and CUDA version.
        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"

        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized Baichuan-13B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad token '
                  'id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = BaichuanModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to know the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the token ids into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
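`load_model_and_tokenizer` first looks for a `config.ini` beside the converted weights and reads a `[baichuan]` section; without it, the fallback defaults from `config.py` apply. A minimal sketch of writing such a file with `configparser` follows; the key names are exactly those the loader reads, while the values below simply mirror the `config.py` defaults and may not match a given checkpoint:

```python
import configparser

cfg = configparser.ConfigParser()
# Keys mirror the cfg.get / getint / getfloat calls in load_model_and_tokenizer();
# values are illustrative (taken from the defaults in config.py).
# "rotary_embedding" is only read by the 7B loader; the 13B loader hardcodes 0.
cfg["baichuan"] = {
    "head_num": "40",
    "size_per_head": "128",
    "inter_size": "13824",
    "num_layer": "40",
    "rotary_embedding": "128",
    "layernorm_eps": "1e-6",
    "vocab_size": "39424",
    "start_id": "1",
    "end_id": "2",
    "weight_data_type": "fp16",
    "tensor_para_size": "1",
}

with open("./models/Baichuan2-13B-lyra/config.ini", "w") as f:
    cfg.write(f)
```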
lyra_baichuan/model.py ADDED
@@ -0,0 +1,166 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import os
import pathlib
import typing

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


class BaichuanModel(nn.Module):
    def __init__(self,
                 head_num,
                 size_per_head,
                 inter_size,
                 vocab_size,
                 rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 max_seq_len: int,
                 layernorm_eps,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 use_gptj_residual,
                 lib_path: typing.Union[str, pathlib.Path],
                 model_path,
                 memopt_mode: int = 0,
                 inference_data_type: str = "fp16",
                 weights_data_type: typing.Union[str, np.dtype] = np.float32):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.inter_size = inter_size
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.max_seq_len = max_seq_len
        self.layer_num = layer_num
        self.use_gptj_residual = use_gptj_residual
        self.layernorm_eps = layernorm_eps
        self.memopt_mode = memopt_mode

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.build_model = False
        self.weights_data_type = weights_data_type
        self.inference_data_type = inference_data_type

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into Pytorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare for tensor/pipeline parallel
        try:
            dist.init_process_group(backend='mpi')
        except:
            print("[INFO] WARNING: Have initialized the process group")
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        self.model = torch.classes.FasterTransformer.BaichuanOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.layernorm_eps,
            self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size,
            self.max_seq_len,
            self.use_gptj_residual,
            self.memopt_mode,
            model_path,
            self.weights_data_type,
            self.inference_data_type)

        self.build_model = True
        torch.cuda.empty_cache()

    def forward(self,
                start_ids: torch.Tensor,
                start_lengths: torch.Tensor,
                output_len,
                beam_width=1,
                top_k: torch.Tensor = None,
                top_p: torch.Tensor = None,
                beam_search_diversity_rate: torch.Tensor = None,
                temperature: torch.Tensor = None,
                len_penalty: torch.Tensor = None,
                repetition_penalty: torch.Tensor = None,
                random_seed: torch.Tensor = None,
                return_output_length=False,
                return_cum_log_probs=0):

        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        input_ids = start_ids.cuda(self.device)
        input_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(input_ids,
                                     input_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None

        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func."""
        self.input_tensor = input_tensor
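For completeness, a hedged sketch of driving `BaichuanModel.forward` directly, which is essentially what `lyraBaichuan13B.generate` above does; per the comments in `forward`, the sampling arguments may be left as `None`. The returned `output_ids` are laid out as `[batch, beam, prompt + generated tokens]`, which is why `generate` slices `out[0, length:]` to keep beam 0 and drop the prompt:

```python
import torch

# Assumes `model` is an already-constructed BaichuanModel and `tokenizer` a BaichuanTokenizer.
ids = tokenizer("登鹳雀楼->王之涣\n夜雨寄北->", return_tensors="pt").input_ids.int()
lengths = torch.IntTensor([ids.size(1)])

output_ids = model(start_ids=ids, start_lengths=lengths, output_len=64)

# Keep beam 0 and strip the prompt tokens before decoding.
generated = output_ids[0, 0, lengths[0]:]
print(tokenizer.decode(generated, skip_special_tokens=True))
```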
lyra_baichuan/tokenization_baichuan.py ADDED
@@ -0,0 +1,232 @@
# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}


class BaichuanTokenizer(PreTrainedTokenizer):
    """
    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
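A quick round trip with the tokenizer on its own; it follows the standard `PreTrainedTokenizer` interface, and the directory is assumed to contain the SentencePiece `tokenizer.model` file:

```python
from lyra_baichuan.tokenization_baichuan import BaichuanTokenizer

tokenizer = BaichuanTokenizer.from_pretrained("./models/Baichuan2-13B-lyra")

ids = tokenizer("登鹳雀楼->王之涣\n夜雨寄北->").input_ids  # a <s> bos id is prepended by default
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))
```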
requirements.txt ADDED
@@ -0,0 +1,6 @@
transformers
numpy
setuptools
torch
bfloat16
sentencepiece