adding streaming in the example provided
README.md
### Basic Usage
To run the model, install the HQQ library from https://github.com/mobiusml/hqq and use it as follows:
``` Python
import transformers
from threading import Thread

model_id = 'mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-2bit-metaoffload-HQQ'
#Load the model
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)

#Set the HQQ backend
from hqq.core.quantize import *
HQQLinear.set_backend(HQQBackend.ATEN_BACKPROP)

def chat_processor(chat, max_new_tokens=100, do_sample=True):
    tokenizer.use_default_system_prompt = False
    #Iterator that yields decoded text chunks as generation produces them
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to('cuda'),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    #Run generation in a background thread so the streamer can be consumed here
    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        print(text, end="", flush=True)

    return outputs

################################################################################################
#Generation
outputs = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)
```
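Generation runs in a background `Thread` while `TextIteratorStreamer` is consumed as a plain iterator on the calling thread, so each decoded chunk can be printed (and collected) the moment it is ready. If you only need the tokens echoed to stdout and don't care about collecting the chunks, `transformers.TextStreamer` achieves the same effect without the extra thread. A minimal sketch reusing the `model` and `tokenizer` loaded above (not part of the original card):

``` Python
#Stdout-only streaming: TextStreamer prints chunks as they are generated, no thread needed
streamer = transformers.TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("<s> [INST] How do I build a car? [/INST] ", return_tensors="pt").to('cuda')
model.generate(**inputs, streamer=streamer, max_new_tokens=1000, do_sample=False)
```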

You can reproduce the model using the following quant configs:

``` Python
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

#hf_auth: your Hugging Face access token; cache_path: local model cache directory
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
model = HQQModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
```
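The diff ends before the configs themselves appear. Going by the model name (4-bit attention, 2-bit MoE experts, offloaded meta-data), a hypothetical sketch of what such a config could look like with HQQ's `BaseQuantizeConfig`; the group sizes and per-layer mapping below are illustrative assumptions, not the card's actual settings:

``` Python
#Hypothetical sketch: the real config values are not shown in this diff
from hqq.core.quantize import BaseQuantizeConfig

#Assumed: 4-bit attention, 2-bit experts, meta-data offloaded to the CPU
attn_params = BaseQuantizeConfig(nbits=4, group_size=64, offload_meta=True)
experts_params = BaseQuantizeConfig(nbits=2, group_size=16, offload_meta=True)

quant_config = {}
#Attention projections
for tag in ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj']:
    quant_config[tag] = attn_params
#Mixtral MoE expert projections
for tag in ['block_sparse_moe.experts.w1', 'block_sparse_moe.experts.w2', 'block_sparse_moe.experts.w3']:
    quant_config[tag] = experts_params

model.quantize_model(quant_config=quant_config)
```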