carsonhxsu committed on
Commit f50d964 · 1 Parent(s): 4f41ca5
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+ dist/
+ *.egg-info/
+ __pycache__
+ build/
+ .vscode
+ .idea
README.md CHANGED
@@ -1,3 +1,115 @@
  ---
  license: mit
+ language: en
+ tags:
+ - LLM
+ - Baichuan-7B
+ - Baichuan-13B
+ - Baichuan2-7B
+ - Baichuan2-13B
  ---
+ ## Model Card for lyraBaichuan
+
+ lyraBaichuan is currently the **fastest available implementation** of the Baichuan models (Baichuan-7B, Baichuan-13B, Baichuan2-7B, Baichuan2-13B). lyraBaichuan reaches an inference speed of up to **4300+ tokens/s** on an A100, up to **2.4x** faster than the original PyTorch implementation.
+
+ Among its main features are:
+ - device: NVIDIA GPUs with the Ampere architecture (A100 or newer) or the Volta architecture (V100).
+ - batch_size: compiled with dynamic batch size; the maximum depends on the device.
+ - MEMOPT mode: significantly reduced VRAM usage and increased speed.
+
+ We use the Baichuan2-7B-Base and Baichuan2-13B-Base models for measurement, but this optimized inference also applies to the other Baichuan models, including Baichuan-7B and Baichuan-13B.
+
+ ## Speed
+
+ * Evaluated in tokens/s
+ * Tested on an A100 40GB
+ * MEMOPT mode enabled
+
+ ### Baichuan2-7B-Base
+
+ | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
+ | --- | --- | --- | --- | --- | --- |
+ | Torch 2.0.1 | 41.2 | 323.2 | 640.0 | 1256.8 | 2231.0 |
+ | lyraBaichuan MEMOPT | 125.9 | 948.1 | 1749.3 | 2974.0 | 4370.1 |
+
+ ### Baichuan2-13B-Base
+
+ | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
+ | --- | --- | --- | --- | --- | --- |
+ | Torch 2.0.1 | 40.9 | 307.9 | 555.6 | 1010.4 | 1601.0 |
+ | lyraBaichuan MEMOPT | 80.0 | 568.2 | 1124.4 | 1942.6 | 2828.0 |
+
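+ The benchmark script itself is not included in this repository. A rough wall-clock check of the numbers above can be done with the API from the Uses section below; this sketch assumes a `model` constructed as shown there and that generation runs for the full `output_length` (early EOS stops would inflate the result):
+
+ ```python
+ import time
+
+ bs = 64
+ prompts = ["登鹳雀楼->王之涣\n夜雨寄北->"] * bs
+ output_length = 64
+
+ _ = model.generate(prompts, output_length=output_length)  # warm-up
+ start = time.time()
+ _ = model.generate(prompts, output_length=output_length)
+ elapsed = time.time() - start
+
+ # Generated tokens per second across the whole batch.
+ print(f"throughput: {bs * output_length / elapsed:.1f} tokens/s")
+ ```
+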
+ ## Docker Environment Recommendation
+
+ - For CUDA 11.X: we recommend `nvcr.io/nvidia/pytorch:22.12-py3`
+ - For CUDA 12.0: we recommend `nvcr.io/nvidia/pytorch:23.02-py3`
+
+ ```bash
+ docker pull nvcr.io/nvidia/pytorch:23.02-py3
+ docker run --rm -it --gpus all -v $(pwd):/lyraBaichuan nvcr.io/nvidia/pytorch:23.02-py3
+
+ # inside the container
+ cd /lyraBaichuan
+ pip install -r requirements.txt
+ python demo.py
+ ```
+
+ ## Uses
+
+ ```python
+ from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
+
+ model_path = "./models/Baichuan2-13B-lyra"
+ tokenizer_path = "./models/Baichuan2-13B-lyra"
+ inference_dtype = 'fp16'
+ prompt = "登鹳雀楼->王之涣\n夜雨寄北->"
+
+ memopt_mode = 1
+ max_output_length = 64
+ arch = "Ampere"     # Ampere or Volta
+ cuda_version = 12   # CUDA version; 11 and 12 are currently supported
+
+ # To use the 7B model, initialize lyraBaichuan7B instead.
+ model = lyraBaichuan13B(model_path,
+                         tokenizer_path=tokenizer_path,
+                         dtype=inference_dtype,
+                         memopt_mode=memopt_mode,
+                         arch=arch,
+                         cuda_version=cuda_version)
+
+ bs = 1
+ prompts = [prompt, ] * bs
+ output_texts = model.generate(
+     prompts, output_length=max_output_length,
+     top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
+
+ print(output_texts)
+ ```
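+
+ If you prefer not to hard-code `arch` and `cuda_version`, both can be derived from the local runtime. The sketch below is illustrative rather than part of the shipped examples: it reuses the constructor and `generate` signature shown above, switches to the 7B class, and enables sampling; the 7B checkpoint path is a placeholder.
+
+ ```python
+ import torch
+ from lyra_baichuan import lyraBaichuan7B
+
+ # Map the GPU compute capability onto the two supported kernel builds
+ # (only Ampere and Volta builds ship with lyraBaichuan).
+ major, _ = torch.cuda.get_device_capability(0)
+ arch = "Ampere" if major >= 8 else "Volta"
+ cuda_version = int(torch.version.cuda.split(".")[0])  # 11 or 12
+
+ # Placeholder path to a converted Baichuan2-7B checkpoint.
+ model_path = "./models/Baichuan2-7B-lyra"
+ model = lyraBaichuan7B(model_path,
+                        tokenizer_path=model_path,
+                        dtype='fp16',
+                        memopt_mode=1,
+                        arch=arch,
+                        cuda_version=cuda_version)
+
+ # do_sample=True draws a fresh random seed per request instead of greedy decoding.
+ output_texts = model.generate(
+     ["登鹳雀楼->王之涣\n夜雨寄北->"],
+     output_length=64,
+     top_k=30, top_p=0.85, temperature=1.0, do_sample=True)
+ print(output_texts)
+ ```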
+
+ ## Demo Outputs
+
+ ### Baichuan2-13B-Base
+ #### Input
+
+ 登鹳雀楼->王之涣
+
+ 夜雨寄北->
+
+ #### Output
+
+ ## TODO
+ 1. Support for int4 quantization.
+ 2. Inference for longer contexts.
+ 3. Streaming inference mode.
+
+ ## Citation
+ ```bibtex
+ @Misc{lyraBaichuan2023,
+   author =       {Haoxiong Su and Kangjian Wu and Zhengtao Wang and Yibo Lu and Bin Wu},
+   title =        {lyraBaichuan: Accelerating Baichuan models to 4300+ tokens/s},
+   howpublished = {\url{https://huggingface.co/TMElyralab/lyraBaichuan}},
+   year =         {2023}
+ }
+ ```
+
+ ## Report Bugs
+ - Start a discussion at https://huggingface.co/TMElyralab/lyraBaichuan to report any bugs.
+ - Mark the title of bug reports with a `[bug]` tag.
demo.py ADDED
@@ -0,0 +1,26 @@
+ from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
+
+ model_path = "./models/Baichuan2-13B-lyra"
+ tokenizer_path = "./models/Baichuan2-13B-lyra"
+ inference_dtype = 'fp16'
+ prompt = "登鹳雀楼->王之涣\n夜雨寄北->"
+
+ memopt_mode = 1
+ max_output_length = 64
+ arch = "Ampere"     # Ampere or Volta
+ cuda_version = 12   # CUDA version; 11 and 12 are currently supported
+
+ model = lyraBaichuan13B(model_path,
+                         tokenizer_path=tokenizer_path,
+                         dtype=inference_dtype,
+                         memopt_mode=memopt_mode,
+                         arch=arch,
+                         cuda_version=cuda_version)
+
+ bs = 1
+ prompts = [prompt, ] * bs
+ output_texts = model.generate(
+     prompts, output_length=max_output_length,
+     top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)
+
+ print(output_texts)
lyra_baichuan/__init__.py ADDED
@@ -0,0 +1 @@
+ from .lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
lyra_baichuan/config.py ADDED
@@ -0,0 +1,34 @@
+ import dataclasses
+ from typing import Optional
+
+
+ @dataclasses.dataclass
+ class LyraBaichuanParam:
+     num_heads: int = 40
+     size_per_head: int = 128
+     inter_size: int = 13824
+     num_layers: int = 40
+     vocab_size: int = 39424
+     start_id: Optional[int] = 1
+     end_id: Optional[int] = 2
+     tensor_para_size: int = 1
+     pipeline_para_size: int = 1
+     remove_padding: bool = True
+     shared_contexts_ratio: float = 1.0
+     layernorm_eps: float = 1e-6
+     weights_data_type: str = "fp16"
+     rotary_embedding: int = 128
+     use_gptj_residual: bool = False
+
+     def __post_init__(self):
+         if not 0.0 <= self.shared_contexts_ratio <= 1.0:
+             raise ValueError(
+                 f'Got an invalid value of shared_contexts_ratio '
+                 f'{self.shared_contexts_ratio} - expected range: [0.0, 1.0]')
+
+     def asdict(self):
+         return dataclasses.asdict(self)
+
+
+ LYRA_BAICHUAN_PARAM = LyraBaichuanParam()
+ LIB_SO_PATH = '/usr/lib/ftlib/libth_transformer.so'
lyra_baichuan/lyra_baichuan.py ADDED
@@ -0,0 +1,367 @@
1
+ from __future__ import annotations
2
+
3
+ import configparser
4
+ import pathlib
5
+ import typing
6
+ import os
7
+
8
+ import torch
9
+ import transformers
10
+ from torch.nn.utils.rnn import pad_sequence
11
+
12
+ from .config import LYRA_BAICHUAN_PARAM, LIB_SO_PATH
13
+ from .model import BaichuanModel
14
+ from .tokenization_baichuan import BaichuanTokenizer
15
+
16
+ class lyraBaichuan7B:
17
+ def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
18
+ self.model_path = model_path
19
+ self.tokenizer_path = tokenizer_path
20
+ self.dtype = dtype
21
+ self.memopt_mode = memopt_mode
22
+ self.arch = arch
23
+ self.cuda_version = cuda_version
24
+
25
+ self.model, self.tokenizer = self.load_model_and_tokenizer()
26
+ print("Got model and tokenizer")
27
+
28
+ def load_model_and_tokenizer(self):
29
+ if self.tokenizer_path is None:
30
+ tokenizer_path = self.model_path
31
+ else:
32
+ tokenizer_path = self.tokenizer_path
33
+
34
+ print(f'Loading tokenizer from {tokenizer_path}')
35
+ tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)
36
+
37
+ checkpoint_path = pathlib.Path(self.model_path)
38
+ config_path = checkpoint_path / 'config.ini'
39
+
40
+ if config_path.exists():
41
+ # Read model params from config.
42
+ cfg = configparser.ConfigParser()
43
+ cfg.read(config_path)
44
+ model_name = 'baichuan'
45
+ inference_data_type = self.dtype
46
+ if inference_data_type is None:
47
+ inference_data_type = cfg.get(model_name, "weight_data_type")
48
+ model_args = dict(
49
+ head_num=cfg.getint(model_name, 'head_num'),
50
+ size_per_head=cfg.getint(model_name, "size_per_head"),
51
+ inter_size=cfg.getint(model_name, 'inter_size'),
52
+ layer_num=cfg.getint(model_name, "num_layer"),
53
+ rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
54
+ layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
55
+ vocab_size=cfg.getint(model_name, "vocab_size"),
56
+ start_id=cfg.getint(model_name, "start_id"),
57
+ end_id=cfg.getint(model_name, "end_id"),
58
+ weights_data_type=cfg.get(model_name, "weight_data_type"),
59
+ tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
60
+ inference_data_type=inference_data_type)
61
+ else:
62
+ inference_data_type = self.dtype
63
+ if inference_data_type is None:
64
+ inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
65
+ model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
66
+ size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
67
+ inter_size=LYRA_BAICHUAN_PARAM.inter_size,
68
+ layer_num=LYRA_BAICHUAN_PARAM.num_layers,
69
+ rotary_embedding_dim=LYRA_BAICHUAN_PARAM.rotary_embedding,
70
+ layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
71
+ vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
72
+ start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
73
+ end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
74
+ weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
75
+ tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
76
+ inference_data_type=inference_data_type)
77
+
78
+ # update common parameters
79
+
80
+ # Load the C++ model into Pytorch model.
81
+ sm = "sm80"
82
+
83
+ if self.arch == "Ampere":
84
+ sm = "sm80"
85
+ elif self.arch == "Volta":
86
+ sm = "sm70"
87
+ else:
88
+ raise Exception(f"unsupported arch: {self.arch}")
89
+
90
+ cu = 'cu11'
91
+ if self.cuda_version == 11:
92
+ cu = 'cu11'
93
+ elif self.cuda_version == 12:
94
+ cu = 'cu12'
95
+ else:
96
+ raise Exception(f"unsupported cuda version: {self.cuda_version}")
97
+
98
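+ # Pick the prebuilt FasterTransformer library that matches the GPU arch (sm70/sm80) and CUDA major version.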
+ lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
99
+
100
+ model_args.update(dict(
101
+ lib_path=lib_path,
102
+ model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
103
+ max_seq_len=0, # for position seq embedding
104
+ pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
105
+ use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
106
+ memopt_mode=self.memopt_mode
107
+ ))
108
+
109
+ print('[FT][INFO] Load Our FT Highly Optimized Baichuan-7B model')
110
+ for k, v in model_args.items():
111
+ print(f' - {k.ljust(25, ".")}: {v}')
112
+
113
+ # Check sanity and consistency between the model and tokenizer.
114
+ checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
+ 'tensor_para_size', 'weights_data_type']
116
+ if None in [model_args[k] for k in checklist]:
117
+ none_params = [p for p in checklist if model_args[p] is None]
118
+ print(f'[FT][WARNING] Found None parameters {none_params}. They must '
119
+ f'be provided either by config file or CLI arguments.')
120
+ if model_args['start_id'] != tokenizer.bos_token_id:
121
+ print('[FT][WARNING] The given start_id does not match the bos token '
+ 'id of the pretrained tokenizer.')
+ if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
+ print('[FT][WARNING] The given end_id matches neither the pad token '
+ 'id nor the eos token id of the pretrained tokenizer.')
126
+
127
+ print(f'Loading model from {self.model_path}')
128
+ model = BaichuanModel(**model_args)
129
+ return model, tokenizer
130
+
131
+ def generate(self, prompts: typing.List[str] | str,
132
+ output_length: int = 512,
133
+ beam_width: int = 1,
134
+ top_k: typing.Optional[torch.IntTensor] = 1,
135
+ top_p: typing.Optional[torch.FloatTensor] = 1.0,
136
+ beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
137
+ temperature: typing.Optional[torch.FloatTensor] = 1.0,
138
+ len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
139
+ repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
140
+ presence_penalty: typing.Optional[torch.FloatTensor] = None,
141
+ min_length: typing.Optional[torch.IntTensor] = None,
142
+ bad_words_list: typing.Optional[torch.IntTensor] = None,
143
+ do_sample: bool = False,
144
+ return_output_length: bool = False,
145
+ return_cum_log_probs: int = 0):
146
+ #
147
+ if isinstance(prompts, str):
148
+ prompts = [prompts, ]
149
+
150
+ inputs = prompts
151
+
152
+ batch_size = len(inputs)
153
+ ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
154
+ ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
155
+
156
+ # we must encode the raw prompt text one by one in order to compute the length of the original text.
157
+ input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
158
+ input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
159
+ # After recording the token length of each input, batch the token lists into a single right-padded tensor.
160
+ input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)
161
+
162
+ random_seed = None
163
+ if do_sample:
164
+ random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)
165
+
166
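+ # Run the FasterTransformer op; the scalar sampling parameters are broadcast to per-request tensors of length batch_size.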
+ outputs = self.model(start_ids=input_token_ids,
167
+ start_lengths=input_lengths,
168
+ output_len=output_length,
169
+ beam_width=beam_width,
170
+ top_k=top_k * ones_int,
171
+ top_p=top_p * ones_float,
172
+ beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
173
+ temperature=temperature * ones_float,
174
+ len_penalty=len_penalty * ones_float,
175
+ repetition_penalty=repetition_penalty * ones_float,
176
+ random_seed=random_seed,
177
+ return_output_length=return_output_length,
178
+ return_cum_log_probs=return_cum_log_probs)
179
+
180
+ if return_cum_log_probs > 0:
181
+ outputs = outputs[0] # output_token_ids.
182
+
183
+ # Slice the generated token ids of the 1st beam result.
184
+ # output = input tokens + generated tokens.
185
+ output_token_ids = [out[0, length:].cpu()
186
+ for out, length in zip(outputs, input_lengths)]
187
+
188
+ output_texts = self.tokenizer.batch_decode(
189
+ output_token_ids, skip_special_tokens=True)
190
+
191
+ return output_texts
192
+
193
+ class lyraBaichuan13B:
194
+ def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
195
+ self.model_path = model_path
196
+ self.tokenizer_path = tokenizer_path
197
+ self.dtype = dtype
198
+ self.memopt_mode = memopt_mode
199
+ self.arch = arch
200
+ self.cuda_version = cuda_version
201
+
202
+ self.model, self.tokenizer = self.load_model_and_tokenizer()
203
+ print("Got model and tokenizer")
204
+
205
+ def load_model_and_tokenizer(self):
206
+ if self.tokenizer_path is None:
207
+ tokenizer_path = self.model_path
208
+ else:
209
+ tokenizer_path = self.tokenizer_path
210
+
211
+ print(f'Loading tokenizer from {tokenizer_path}')
212
+ tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)
213
+
214
+ checkpoint_path = pathlib.Path(self.model_path)
215
+ config_path = checkpoint_path / 'config.ini'
216
+
217
+ if config_path.exists():
218
+ # Read model params from config.
219
+ cfg = configparser.ConfigParser()
220
+ cfg.read(config_path)
221
+ model_name = 'baichuan'
222
+ inference_data_type = self.dtype
223
+ if inference_data_type is None:
224
+ inference_data_type = cfg.get(model_name, "weight_data_type")
225
+ model_args = dict(
226
+ head_num=cfg.getint(model_name, 'head_num'),
227
+ size_per_head=cfg.getint(model_name, "size_per_head"),
228
+ inter_size=cfg.getint(model_name, 'inter_size'),
229
+ layer_num=cfg.getint(model_name, "num_layer"),
230
+ rotary_embedding_dim=0,
231
+ layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
232
+ vocab_size=cfg.getint(model_name, "vocab_size"),
233
+ start_id=cfg.getint(model_name, "start_id"),
234
+ end_id=cfg.getint(model_name, "end_id"),
235
+ weights_data_type=cfg.get(model_name, "weight_data_type"),
236
+ tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
237
+ inference_data_type=inference_data_type)
238
+ else:
239
+ inference_data_type = self.dtype
240
+ if inference_data_type is None:
241
+ inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
242
+ model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
243
+ size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
244
+ inter_size=LYRA_BAICHUAN_PARAM.inter_size,
245
+ layer_num=LYRA_BAICHUAN_PARAM.num_layers,
246
+ rotary_embedding_dim=0,
247
+ layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
248
+ vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
249
+ start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
250
+ end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
251
+ weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
252
+ tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
253
+ inference_data_type=inference_data_type)
254
+
255
+ # update common parameters
256
+ # Load the C++ model into Pytorch model.
257
+ sm = "sm80"
258
+
259
+ if self.arch == "Ampere":
260
+ sm = "sm80"
261
+ elif self.arch == "Volta":
262
+ sm = "sm70"
263
+ else:
264
+ raise Exception(f"unsupported arch: {self.arch}")
265
+
266
+ cu = 'cu11'
267
+ if self.cuda_version == 11:
268
+ cu = 'cu11'
269
+ elif self.cuda_version == 12:
270
+ cu = 'cu12'
271
+ else:
272
+ raise Exception(f"unsupported cuda version: {self.cuda_version}")
273
+
274
+ lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
275
+ model_args.update(dict(
276
+ lib_path=lib_path,
277
+ model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
278
+ max_seq_len=0, # for position seq embedding
279
+ pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
280
+ use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
281
+ memopt_mode=self.memopt_mode
282
+ ))
283
+
284
+ print('[FT][INFO] Load Our FT Highly Optimized Baichuan-13B model')
285
+ for k, v in model_args.items():
286
+ print(f' - {k.ljust(25, ".")}: {v}')
287
+
288
+ # Check sanity and consistency between the model and tokenizer.
289
+ checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
+ 'tensor_para_size', 'weights_data_type']
291
+ if None in [model_args[k] for k in checklist]:
292
+ none_params = [p for p in checklist if model_args[p] is None]
293
+ print(f'[FT][WARNING] Found None parameters {none_params}. They must '
294
+ f'be provided either by config file or CLI arguments.')
295
+ if model_args['start_id'] != tokenizer.bos_token_id:
296
+ print('[FT][WARNING] The given start_id does not match the bos token '
+ 'id of the pretrained tokenizer.')
+ if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
+ print('[FT][WARNING] The given end_id matches neither the pad token '
+ 'id nor the eos token id of the pretrained tokenizer.')
301
+
302
+ print(f'Loading model from {self.model_path}')
303
+ model = BaichuanModel(**model_args)
304
+ return model, tokenizer
305
+
306
+ def generate(self, prompts: typing.List[str] | str,
307
+ output_length: int = 512,
308
+ beam_width: int = 1,
309
+ top_k: typing.Optional[torch.IntTensor] = 1,
310
+ top_p: typing.Optional[torch.FloatTensor] = 1.0,
311
+ beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
312
+ temperature: typing.Optional[torch.FloatTensor] = 1.0,
313
+ len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
314
+ repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
315
+ presence_penalty: typing.Optional[torch.FloatTensor] = None,
316
+ min_length: typing.Optional[torch.IntTensor] = None,
317
+ bad_words_list: typing.Optional[torch.IntTensor] = None,
318
+ do_sample: bool = False,
319
+ return_output_length: bool = False,
320
+ return_cum_log_probs: int = 0):
321
+ #
322
+ if isinstance(prompts, str):
323
+ prompts = [prompts, ]
324
+
325
+ inputs = prompts
326
+
327
+ batch_size = len(inputs)
328
+ ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
329
+ ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
330
+
331
+ # we must encode the raw prompt text one by one in order to compute the length of the original text.
332
+ input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
333
+ input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
334
+ # After recording the token length of each input, batch the token lists into a single right-padded tensor.
335
+ input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)
336
+
337
+ random_seed = None
338
+ if do_sample:
339
+ random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)
340
+
341
+ outputs = self.model(start_ids=input_token_ids,
342
+ start_lengths=input_lengths,
343
+ output_len=output_length,
344
+ beam_width=beam_width,
345
+ top_k=top_k * ones_int,
346
+ top_p=top_p * ones_float,
347
+ beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
348
+ temperature=temperature * ones_float,
349
+ len_penalty=len_penalty * ones_float,
350
+ repetition_penalty=repetition_penalty * ones_float,
351
+ random_seed=random_seed,
352
+ return_output_length=return_output_length,
353
+ return_cum_log_probs=return_cum_log_probs)
354
+
355
+ if return_cum_log_probs > 0:
356
+ outputs = outputs[0] # output_token_ids.
357
+
358
+ # Slice the generated token ids of the 1st beam result.
359
+ # output = input tokens + generated tokens.
360
+ output_token_ids = [out[0, length:].cpu()
361
+ for out, length in zip(outputs, input_lengths)]
362
+
363
+ output_texts = self.tokenizer.batch_decode(
364
+ output_token_ids, skip_special_tokens=True)
365
+
366
+ return output_texts
367
+
lyra_baichuan/model.py ADDED
@@ -0,0 +1,166 @@
1
+ # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import copy
18
+ import os
19
+ import pathlib
20
+ import typing
21
+
22
+ import numpy as np
23
+ import torch
24
+ import torch.distributed as dist
25
+ import torch.nn as nn
26
+
27
+ str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
28
+
29
+ class BaichuanModel(nn.Module):
30
+ def __init__(self,
31
+ head_num,
32
+ size_per_head,
33
+ inter_size,
34
+ vocab_size,
35
+ rotary_embedding_dim,
36
+ start_id, end_id, layer_num,
37
+ max_seq_len: int,
38
+ layernorm_eps,
39
+ tensor_para_size: int,
40
+ pipeline_para_size: int,
41
+ use_gptj_residual,
42
+ lib_path: typing.Union[str, pathlib.Path],
43
+ model_path,
44
+ memopt_mode: int = 0,
45
+ inference_data_type: str = "fp16",
46
+ weights_data_type: typing.Union[str, np.dtype] = np.float32):
47
+ super().__init__()
48
+ self.head_num = head_num
49
+ self.size_per_head = size_per_head
50
+ self.inter_size = inter_size
51
+ self.vocab_size = vocab_size
52
+ self.rotary_embedding_dim = rotary_embedding_dim
53
+ self.start_id = start_id
54
+ self.end_id = end_id
55
+ self.max_seq_len = max_seq_len
56
+ self.layer_num = layer_num
57
+ self.use_gptj_residual = use_gptj_residual
58
+ self.layernorm_eps = layernorm_eps
59
+ self.memopt_mode = memopt_mode
60
+
61
+ # multi-gpu params
62
+ self.tensor_para_size = tensor_para_size
63
+ self.pipeline_para_size = pipeline_para_size
64
+ self.build_model = False
65
+ self.weights_data_type = weights_data_type
66
+ self.inference_data_type = inference_data_type
67
+
68
+ assert torch.cuda.is_available(), "CUDA is required for this model."
69
+
70
+ assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
71
+ assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."
72
+
73
+ # Load the C++ model into Pytorch model.
74
+ torch.classes.load_library(os.path.abspath(lib_path))
75
+
76
+ # Prepare for tensor/pipeline parallel
77
+ try:
78
+ dist.init_process_group(backend='mpi')
79
+ except:
80
+ print("[INFO] WARNING: Have initialized the process group")
81
+ self.rank = dist.get_rank()
82
+ self.device_count = torch.cuda.device_count()
83
+ self.device = self.rank % self.device_count
84
+ torch.cuda.set_device(self.device)
85
+
86
+ world_size = dist.get_world_size()
87
+ # print(tensor_para_size * pipeline_para_size)
88
+ assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."
89
+
90
+ self.tensor_para_rank = self.rank % self.tensor_para_size
91
+ self.pipeline_para_rank = self.rank // self.tensor_para_size
92
+
93
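+ # Instantiate the custom op class exported by the FasterTransformer shared library loaded above via torch.classes.load_library.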
+ self.model = torch.classes.FasterTransformer.BaichuanOp(
94
+ self.head_num, self.size_per_head, self.inter_size,
95
+ self.layer_num,
96
+ self.vocab_size,
97
+ self.rotary_embedding_dim,
98
+ self.layernorm_eps,
99
+ self.start_id, self.end_id,
100
+ self.tensor_para_size, self.pipeline_para_size,
101
+ self.max_seq_len,
102
+ self.use_gptj_residual,
103
+ self.memopt_mode,
104
+ model_path,
105
+ self.weights_data_type,
106
+ self.inference_data_type)
107
+
108
+ self.build_model = True
109
+ torch.cuda.empty_cache()
110
+
111
+ def forward(self,
112
+ start_ids: torch.Tensor,
113
+ start_lengths: torch.Tensor,
114
+ output_len,
115
+ beam_width=1,
116
+ top_k: torch.Tensor = None,
117
+ top_p: torch.Tensor = None,
118
+ beam_search_diversity_rate: torch.Tensor = None,
119
+ temperature: torch.Tensor = None,
120
+ len_penalty: torch.Tensor = None,
121
+ repetition_penalty: torch.Tensor = None,
122
+ random_seed: torch.Tensor = None,
123
+ return_output_length=False,
124
+ return_cum_log_probs=0):
125
+
126
+ input_len = start_ids.size(1)
127
+ assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
128
+
129
+ # Inputs to device
130
+ input_ids = start_ids.cuda(self.device)
131
+ input_lengths = start_lengths.cuda(self.device)
132
+ # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
133
+ outputs = self.model.forward(input_ids,
134
+ input_lengths,
135
+ output_len,
136
+ beam_width, # optional, can be None
137
+ top_k, # optional, can be None
138
+ top_p, # optional, can be None
139
+ beam_search_diversity_rate, # optional, can be None
140
+ temperature, # optional, can be None
141
+ len_penalty, # optional, can be None
142
+ repetition_penalty, # optional, can be None
143
+ random_seed, # optional, can be None
144
+ return_cum_log_probs) # optional, can be None
145
+
146
+ if return_cum_log_probs == 0:
147
+ output_ids, output_lengths = outputs
148
+ else:
149
+ output_ids, output_lengths, output_cum_log_probs = outputs
150
+ if return_output_length:
151
+ if return_cum_log_probs > 0:
152
+ return output_ids, output_lengths, output_cum_log_probs
153
+ else:
154
+ return output_ids, output_lengths
155
+ else:
156
+ return output_ids
157
+
158
+ def set_input_tensor(self, input_tensor):
159
+ """Set input tensor to be used instead of forward()'s input.
160
+
161
+ When doing pipeline parallelism the input from the previous
162
+ stage comes from communication, not from the input, so the
163
+ model's forward_step_func won't have it. This function is thus
164
+ used by internal code to bypass the input provided by the
165
+ forward_step_func"""
166
+ self.input_tensor = input_tensor
lyra_baichuan/tokenization_baichuan.py ADDED
@@ -0,0 +1,232 @@
1
+ # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
2
+
3
+ import os
4
+ from shutil import copyfile
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ import sentencepiece as spm
8
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
9
+ from transformers.utils import logging
10
+
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
15
+
16
+ PRETRAINED_VOCAB_FILES_MAP = {
17
+ "vocab_file": {},
18
+ "tokenizer_file": {},
19
+ }
20
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
21
+
22
+
23
+ class BaichuanTokenizer(PreTrainedTokenizer):
24
+ """
25
+ Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.
26
+
27
+ Args:
28
+ vocab_file (`str`):
29
+ Path to the vocabulary file.
30
+ """
31
+
32
+ vocab_files_names = VOCAB_FILES_NAMES
33
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
34
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
35
+ model_input_names = ["input_ids", "attention_mask"]
36
+
37
+ def __init__(
38
+ self,
39
+ vocab_file,
40
+ unk_token="<unk>",
41
+ bos_token="<s>",
42
+ eos_token="</s>",
43
+ pad_token=None,
44
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
45
+ add_bos_token=True,
46
+ add_eos_token=False,
47
+ clean_up_tokenization_spaces=False,
48
+ **kwargs,
49
+ ):
50
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
51
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
52
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
53
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
54
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
55
+ super().__init__(
56
+ bos_token=bos_token,
57
+ eos_token=eos_token,
58
+ unk_token=unk_token,
59
+ pad_token=pad_token,
60
+ add_bos_token=add_bos_token,
61
+ add_eos_token=add_eos_token,
62
+ sp_model_kwargs=self.sp_model_kwargs,
63
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
64
+ **kwargs,
65
+ )
66
+ self.vocab_file = vocab_file
67
+ self.add_bos_token = add_bos_token
68
+ self.add_eos_token = add_eos_token
69
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
70
+ self.sp_model.Load(vocab_file)
71
+
72
+ def __getstate__(self):
73
+ state = self.__dict__.copy()
74
+ state["sp_model"] = None
75
+ return state
76
+
77
+ def __setstate__(self, d):
78
+ self.__dict__ = d
79
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
80
+ self.sp_model.Load(self.vocab_file)
81
+
82
+ @property
83
+ def vocab_size(self):
84
+ """Returns vocab size"""
85
+ return self.sp_model.get_piece_size()
86
+
87
+ def get_vocab(self):
88
+ """Returns vocab as a dict"""
89
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
90
+ vocab.update(self.added_tokens_encoder)
91
+ return vocab
92
+
93
+ def _tokenize(self, text):
94
+ """Returns a tokenized string."""
95
+ return self.sp_model.encode(text, out_type=str)
96
+
97
+ def _convert_token_to_id(self, token):
98
+ """Converts a token (str) in an id using the vocab."""
99
+ return self.sp_model.piece_to_id(token)
100
+
101
+ def _convert_id_to_token(self, index):
102
+ """Converts an index (integer) in a token (str) using the vocab."""
103
+ token = self.sp_model.IdToPiece(index)
104
+ return token
105
+
106
+ def convert_tokens_to_string(self, tokens):
107
+ """Converts a sequence of tokens (string) in a single string."""
108
+ current_sub_tokens = []
109
+ out_string = ""
110
+ prev_is_special = False
111
+ for i, token in enumerate(tokens):
112
+ # make sure that special tokens are not decoded using sentencepiece model
113
+ if token in self.all_special_tokens:
114
+ if not prev_is_special and i != 0:
115
+ out_string += " "
116
+ out_string += self.sp_model.decode(current_sub_tokens) + token
117
+ prev_is_special = True
118
+ current_sub_tokens = []
119
+ else:
120
+ current_sub_tokens.append(token)
121
+ prev_is_special = False
122
+ out_string += self.sp_model.decode(current_sub_tokens)
123
+ return out_string
124
+
125
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
126
+ """
127
+ Save the vocabulary and special tokens file to a directory.
128
+
129
+ Args:
130
+ save_directory (`str`):
131
+ The directory in which to save the vocabulary.
132
+
133
+ Returns:
134
+ `Tuple(str)`: Paths to the files saved.
135
+ """
136
+ if not os.path.isdir(save_directory):
137
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
138
+ return
139
+ out_vocab_file = os.path.join(
140
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
141
+ )
142
+
143
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
144
+ copyfile(self.vocab_file, out_vocab_file)
145
+ elif not os.path.isfile(self.vocab_file):
146
+ with open(out_vocab_file, "wb") as fi:
147
+ content_spiece_model = self.sp_model.serialized_model_proto()
148
+ fi.write(content_spiece_model)
149
+
150
+ return (out_vocab_file,)
151
+
152
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
153
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
154
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
155
+
156
+ output = bos_token_id + token_ids_0 + eos_token_id
157
+
158
+ if token_ids_1 is not None:
159
+ output = output + bos_token_id + token_ids_1 + eos_token_id
160
+
161
+ return output
162
+
163
+ def get_special_tokens_mask(
164
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
165
+ ) -> List[int]:
166
+ """
167
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
168
+ special tokens using the tokenizer `prepare_for_model` method.
169
+
170
+ Args:
171
+ token_ids_0 (`List[int]`):
172
+ List of IDs.
173
+ token_ids_1 (`List[int]`, *optional*):
174
+ Optional second list of IDs for sequence pairs.
175
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
176
+ Whether or not the token list is already formatted with special tokens for the model.
177
+
178
+ Returns:
179
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
180
+ """
181
+ if already_has_special_tokens:
182
+ return super().get_special_tokens_mask(
183
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
184
+ )
185
+
186
+ bos_token_id = [1] if self.add_bos_token else []
187
+ eos_token_id = [1] if self.add_eos_token else []
188
+
189
+ if token_ids_1 is None:
190
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
191
+ return (
192
+ bos_token_id
193
+ + ([0] * len(token_ids_0))
194
+ + eos_token_id
195
+ + bos_token_id
196
+ + ([0] * len(token_ids_1))
197
+ + eos_token_id
198
+ )
199
+
200
+ def create_token_type_ids_from_sequences(
201
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
202
+ ) -> List[int]:
203
+ """
204
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
205
+ sequence pair mask has the following format:
206
+
207
+ ```
208
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
209
+ | first sequence | second sequence |
210
+ ```
211
+
212
+ if token_ids_1 is None, only returns the first portion of the mask (0s).
213
+
214
+ Args:
215
+ token_ids_0 (`List[int]`):
216
+ List of ids.
217
+ token_ids_1 (`List[int]`, *optional*):
218
+ Optional second list of IDs for sequence pairs.
219
+
220
+ Returns:
221
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
222
+ """
223
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
224
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
225
+
226
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
227
+
228
+ if token_ids_1 is not None:
229
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
230
+
231
+ return output
232
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers
+ numpy
+ setuptools
+ torch
+ bfloat16
+ sentencepiece