aashish1904
committed on
Commit
•
4909a51
1
Parent(s):
368b6e1
Upload README.md with huggingface_hub
Browse files
README.md
CHANGED
@@ -30,6 +30,7 @@ This is quantized version of [THUDM/LongWriter-glm4-9b](https://huggingface.co/T
|
|
30 |
|
31 |
LongWriter-glm4-9b is trained based on [glm-4-9b](https://huggingface.co/THUDM/glm-4-9b), and is capable of generating 10,000+ words at once.
|
32 |
|
|
|
33 |
|
34 |
A simple demo for deployment of the model:
|
35 |
```python
|
@@ -39,20 +40,39 @@ tokenizer = AutoTokenizer.from_pretrained("THUDM/LongWriter-glm4-9b", trust_remo
|
|
39 |
model = AutoModelForCausalLM.from_pretrained("THUDM/LongWriter-glm4-9b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
|
40 |
model = model.eval()
|
41 |
query = "Write a 10000-word China travel guide"
|
42 |
-
|
43 |
-
input = tokenizer(prompt, truncation=False, return_tensors="pt").to(device)
|
44 |
-
context_length = input.input_ids.shape[-1]
|
45 |
-
output = model.generate(
|
46 |
-
**input,
|
47 |
-
max_new_tokens=32768,
|
48 |
-
num_beams=1,
|
49 |
-
do_sample=True,
|
50 |
-
temperature=0.5,
|
51 |
-
)[0]
|
52 |
-
response = tokenizer.decode(output[context_length:], skip_special_tokens=True)
|
53 |
print(response)
|
54 |
```
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
License: [glm-4-9b License](https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/LICENSE)
|
58 |
|
|
|
30 |
|
31 |
LongWriter-glm4-9b is trained based on [glm-4-9b](https://huggingface.co/THUDM/glm-4-9b), and is capable of generating 10,000+ words at once.
|
32 |
|
33 |
+
Environment: Same environment requirements as [glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat) (`transformers>=4.43.0`).
|
34 |
|
35 |
A simple demo for deployment of the model:
|
36 |
```python
|
|
|
40 |
model = AutoModelForCausalLM.from_pretrained("THUDM/LongWriter-glm4-9b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
|
41 |
model = model.eval()
|
42 |
query = "Write a 10000-word China travel guide"
|
43 |
+
response, history = model.chat(tokenizer, query, history=[], max_new_tokens=32768, temperature=0.5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
print(response)
|
45 |
```
|
46 |
+
You can also deploy the model with [vllm](https://github.com/vllm-project/vllm), which allows generating 10,000+ words within a minute. Here is some example code:
|
47 |
+
```python
|
48 |
+
from vllm import LLM, SamplingParams
|
49 |
+
model = LLM(
|
50 |
+
model= "THUDM/LongWriter-glm4-9b",
|
51 |
+
dtype="auto",
|
52 |
+
trust_remote_code=True,
|
53 |
+
tensor_parallel_size=1,
|
54 |
+
max_model_len=32768,
|
55 |
+
gpu_memory_utilization=1,
|
56 |
+
)
|
57 |
+
tokenizer = model.get_tokenizer()
|
58 |
+
stop_token_ids = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), tokenizer.get_command("<|observation|>")]
|
59 |
+
generation_params = SamplingParams(
|
60 |
+
temperature=0.5,
|
61 |
+
top_p=0.8,
|
62 |
+
top_k=50,
|
63 |
+
max_tokens=32768,
|
64 |
+
repetition_penalty=1,
|
65 |
+
stop_token_ids=stop_token_ids
|
66 |
+
)
|
67 |
+
query = "Write a 10000-word China travel guide"
|
68 |
+
input_ids = tokenizer.build_chat_input(query, history=[], role='user').input_ids[0].tolist()
|
69 |
+
outputs = model.generate(
|
70 |
+
sampling_params=generation_params,
|
71 |
+
prompt_token_ids=[input_ids],
|
72 |
+
)
|
73 |
+
output = outputs[0]
|
74 |
+
print(output.outputs[0].text)
|
75 |
+
```
|
76 |
|
77 |
License: [glm-4-9b License](https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/LICENSE)
|
78 |
|