Update README.md
README.md (CHANGED)
````diff
@@ -1,20 +1,31 @@
 ---
 language:
 - en
+- ja
 library_name: transformers
+license: llama3
 tags:
 - gpt
 - llm
 - large language model
 - h2o-llmstudio
 inference: false
-thumbnail:
+thumbnail: >-
+  https://h2o.ai/etc.clientlibs/h2o/clientlibs/clientlib-site/resources/images/favicon.ico
+datasets:
+- fujiki/japanese_hh-rlhf-49k
+pipeline_tag: text-generation
 ---
-# Model Card
-## Summary
 
-
-
+## Introduction
+
+This is a `meta-llama/Meta-Llama-3-8B-Instruct` model fine-tuned on a **Japanese** conversation dataset.
+
+Dataset: [japanese_hh-rlhf-49k](https://huggingface.co/datasets/fujiki/japanese_hh-rlhf-49k)
+
+Training framework: [h2o-llmstudio](https://github.com/h2oai/h2o-llmstudio)
+
+Training max context length: 8k
 
 
 ## Usage
````
````diff
@@ -57,9 +68,8 @@ generate_text = pipeline(
 # generate_text.model.generation_config.repetition_penalty = float(1.0)
 
 messages = [
-    {"role": "
-    {"role": "
-    {"role": "user", "content": "Why is drinking water so healthy?"},
+    {"role": "system", "content": "あなたは、常に海賊の言葉で返事する海賊チャットボットです!"},
+    {"role": "user", "content": "自己紹介してください"},
 ]
 
 res = generate_text(
````
````diff
@@ -88,9 +98,8 @@ model_name = "haqishen/h2o-Llama-3-8B-Japanese-Instruct" # either local folder
 # Important: The prompt needs to be in the same format the model was trained with.
 # You can find an example prompt in the experiment logs.
 messages = [
-    {"role": "
-    {"role": "
-    {"role": "user", "content": "Why is drinking water so healthy?"},
+    {"role": "system", "content": "あなたは、常に海賊の言葉で返事する海賊チャットボットです!"},
+    {"role": "user", "content": "自己紹介してください"},
 ]
 
 tokenizer = AutoTokenizer.from_pretrained(
````
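Both usage hunks above swap the English example conversation for a Japanese one: a system prompt telling the model to answer as a pirate, plus a user turn asking it to introduce itself. The surrounding `pipeline(...)` and `generate_text(...)` arguments are truncated in this diff view, so the following is only a minimal, self-contained sketch of running the new messages through the transformers text-generation pipeline; the loading and generation settings here are illustrative assumptions, not the card's exact snippet.

```python
from transformers import pipeline

# Assumed loading arguments; the card's own pipeline(...) call is truncated in the diff above.
# device_map="auto" requires the accelerate package.
generate_text = pipeline(
    "text-generation",
    model="haqishen/h2o-Llama-3-8B-Japanese-Instruct",
    torch_dtype="auto",
    device_map="auto",
)

messages = [
    {"role": "system", "content": "あなたは、常に海賊の言葉で返事する海賊チャットボットです!"},
    {"role": "user", "content": "自己紹介してください"},
]

# Recent transformers releases accept chat-format input and apply the model's chat template.
res = generate_text(messages, max_new_tokens=256)

# For chat inputs, "generated_text" holds the whole conversation; the last turn is the reply.
print(res[0]["generated_text"][-1]["content"])
```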
````diff
@@ -133,6 +142,41 @@ answer = tokenizer.decode(tokens, skip_special_tokens=True)
 print(answer)
 ```
 
+
+### Use with vllm
+
+[vllm-project/vllm](https://github.com/vllm-project/vllm)
+
+```python
+from vllm import LLM, SamplingParams
+model_id = "haqishen/h2o-Llama-3-8B-Japanese-Instruct"
+llm = LLM(
+    model=model_id,
+    trust_remote_code=True,
+    tensor_parallel_size=2,
+)
+tokenizer = llm.get_tokenizer()
+messages = [
+    {"role": "system", "content": "あなたは、常に海賊の言葉で返事する海賊チャットボットです!"},
+    {"role": "user", "content": "自己紹介してください"},
+]
+conversations = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+outputs = llm.generate(
+    [conversations],
+    SamplingParams(
+        temperature=0.6,
+        top_p=0.9,
+        max_tokens=1024,
+        stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")],
+    )
+)
+print(outputs[0].outputs[0].text.strip())
+```
+
 ## Quantization and sharding
 
 You can load the models using quantization by specifying ```load_in_8bit=True``` or ```load_in_4bit=True```. Also, sharding on multiple GPUs is possible by setting ```device_map=auto```.
````
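The quantization and sharding note in the card is prose-only, so here is a minimal sketch of what it describes, assuming `bitsandbytes` (for the quantization flags) and `accelerate` (for `device_map="auto"`) are installed, and a transformers version that still accepts these flags directly (newer releases route them through `BitsAndBytesConfig`).

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "haqishen/h2o-Llama-3-8B-Japanese-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,   # or load_in_8bit=True; both need the bitsandbytes package
    device_map="auto",   # shard the weights across all visible GPUs (needs accelerate)
)
```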