add inference code
README.md CHANGED
@@ -52,3 +52,37 @@ This is Bloomz-7b1-mt model fine-tuned with multilingual instruction dataset and
- Thai
- Vietnamese
- Chinese
## Usage

The following code shows how to run inference with this model:
```
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

peft_model_id = "cahya/bloomz-7b1-instruct"
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model in 8-bit and let accelerate place it on the available devices
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA adapter weights on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id)

# Tokenize the prompt and move it to the first GPU
batch = tokenizer("User: How old is the universe?\nAssistant: ", return_tensors='pt').to(0)

with torch.cuda.amp.autocast():
    output_tokens = model.generate(
        **batch,
        max_new_tokens=200,
        min_length=50,
        do_sample=True,
        top_k=40,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.2,
        num_return_sequences=1,
    )

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
```
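For repeated queries, it can be convenient to wrap the prompt formatting and answer extraction in a small helper. The following is a minimal sketch, assuming the `User: ...\nAssistant: ` turn format from the example above; the `ask` helper and its signature are ours for illustration, not part of the model or the PEFT API:

```
def ask(model, tokenizer, question, **gen_kwargs):
    # Hypothetical helper: builds the same prompt format as the example above
    prompt = f"User: {question}\nAssistant: "
    batch = tokenizer(prompt, return_tensors='pt').to(0)
    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**batch, max_new_tokens=200, **gen_kwargs)
    text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # The decoded output echoes the prompt; strip it (approximately) so only
    # the assistant's reply is returned
    return text[len(prompt):]

print(ask(model, tokenizer, "How old is the universe?", do_sample=True, top_p=0.9))
```

Note that the base model is loaded in 8-bit here to keep the 7B-parameter model within a single consumer GPU's memory; loading in full precision works the same way if you drop the `quantization_config` argument.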