Update README.md

README.md (changed)

**Important Note:** Models based on Gemma 2 such as BgGPT-Gemma-2-2.6B-IT-v1.0 do not support flash attention. Using it results in degraded performance.
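
To make sure flash attention is never selected, you can pin the attention backend when loading the model. A minimal sketch (the dtype and device settings are assumptions; `attn_implementation` is the standard `transformers` argument):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0",
    torch_dtype=torch.bfloat16,   # assumed dtype, matching the vLLM example below
    device_map="auto",            # assumed device placement
    attn_implementation="eager",  # force eager attention; avoid "flash_attention_2"
)
```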
Example chat usage with `transformers`, reusing the `model` loaded above (the `GenerationConfig` values below are a sketch mirroring the vLLM sampling parameters in the next section):

```python
from transformers import AutoTokenizer, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained(
    "INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0",
    use_default_system_prompt=False,
)

# Sampling settings assumed to match the vLLM example below.
generation_params = GenerationConfig(
    max_new_tokens=2048,
    do_sample=True,
    temperature=0.1,
    top_k=25,
    top_p=1,
    repetition_penalty=1.1,
    eos_token_id=[1, 107],  # <eos> and <end_of_turn>
)

messages = [
    # "When was Sofia University founded?"
    {"role": "user", "content": "Кога е основан Софийският университет?"},
]

# Build the prompt with the model's chat template and move it to the model device.
input_ids = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

outputs = model.generate(
    **input_ids,
    generation_config=generation_params,
)
print(tokenizer.decode(outputs[0]))
```
# Use with vLLM

Example usage with vLLM:

```python
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0",
    use_default_system_prompt=False,
)

sampling_params = SamplingParams(
    max_tokens=2048,
    temperature=0.1,
    top_k=25,
    top_p=1,
    repetition_penalty=1.1,
    stop_token_ids=[1, 107],  # <eos> and <end_of_turn>
)

llm = LLM(
    model="INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0",
    dtype="bfloat16",
    enforce_eager=True,  # disable CUDA graph capture
)

messages = [
    # "When was Sofia University founded?"
    {"role": "user", "content": "Кога е основан Софийският университет?"},
]

# Render the chat template as text, then tokenize it separately.
formatted_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# The template already inserts special tokens, so don't add them again.
input_ids = tokenizer(
    formatted_prompt,
    add_special_tokens=False,
).input_ids

prompt = TokensPrompt(prompt_token_ids=input_ids)

output = llm.generate(
    prompt,
    sampling_params,
)

generated_text = output[0].outputs[0].text
print(generated_text)
```
# Use with GGML / llama.cpp
The model and instructions for usage in GGUF format are available at [INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF](https://huggingface.co/INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF).
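
For a quick start once you have a GGUF file, a minimal sketch using the `llama-cpp-python` bindings (the quantization filename pattern below is hypothetical; check the GGUF repository for the actual files and instructions):

```python
from llama_cpp import Llama

# Download and load a quantized GGUF build (filename pattern is hypothetical).
llm = Llama.from_pretrained(
    repo_id="INSAIT-Institute/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF",
    filename="*Q4_K_M.gguf",  # hypothetical quantization level
    n_ctx=2048,
)

response = llm.create_chat_completion(
    messages=[
        # "When was Sofia University founded?"
        {"role": "user", "content": "Кога е основан Софийският университет?"},
    ],
    max_tokens=2048,
    temperature=0.1,
)
print(response["choices"][0]["message"]["content"])
```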