"""
https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
https://github.com/awinml/llama-cpp-python-bindings
python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
"""
import json
import copy
from base_model import Simulator
import llama_cpp
# import llama_cpp.llama_tokenizer
from transformers import AutoTokenizer
from utils.logging_util import logger
import config
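
# `config` is expected to provide MAX_SEQUENCE_LENGTH, DEFAULT_TEMPERATURE,
# DEFAULT_TOP_P, DEFAULT_TOP_K and DEFAULT_MAX_TOKENS (all used below).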


class Qwen2Simulator(Simulator):

    def __init__(self, from_local=False):
        if from_local:
            self.hf_tokenizer = AutoTokenizer.from_pretrained(
                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
            self.llm = llama_cpp.Llama(  # n_ctx, n_threads
                model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                # n_threads=None,  # defaults to a value derived from the CPU count
                verbose=False,
            )
        else:
            self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                filename="*fp16.gguf",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                verbose=False,
            )
        logger.info(f"llm has been initialized: {self.llm}, "
                    f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx()}")

        self.generation_kwargs = dict(
            temperature=config.DEFAULT_TEMPERATURE,
            top_p=config.DEFAULT_TOP_P,
            top_k=config.DEFAULT_TOP_K,
            max_tokens=config.DEFAULT_MAX_TOKENS,
            repeat_penalty=1.1,
            # qwen2-0.5b-chat sometimes ends a generation without <|im_end|> and
            # jumps straight to <|im_start|>, so stop on both.
            stop=[
                "<|im_end|>",
                "<|im_start|>",
                "<|endoftext|>",
            ],
        )

    def generate_query(self, messages, stream=True):
        """Generate the next *user* turn.

        `messages` must not end with a user message; the conversation is rendered
        without a generation prompt and an "<|im_start|>user" header (plus newline)
        is appended so the model continues with a user message.
        """
        assert messages[-1]["role"] != "user"
        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        inputs = inputs + "<|im_start|>user\n"
        if stream:
            return self._stream_generate(inputs)
        else:
            return self._generate(inputs)
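
    # Illustrative sketch of the prompt generate_query feeds the model for the
    # __main__ example below (assuming the stock Qwen2 ChatML chat template;
    # exact whitespace may differ):
    #
    #   <|im_start|>system
    #   you are a helpful assistant<|im_end|>
    #   <|im_start|>user
    #   hi, what's your name<|im_end|>
    #   <|im_start|>assistant
    #   My name is Jordan<|im_end|>
    #   <|im_start|>user
    #
    # i.e. the model is prompted to write the next user turn itself.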

    def generate_response(self, messages, stream=True):
        """Generate the next *assistant* turn; `messages` must end with a user message."""
        assert messages[-1]["role"] == "user"
        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        if stream:
            return self._stream_generate(inputs)
        else:
            return self._generate(inputs)

    def _generate(self, inputs):
        logger.info(f"generation_kwargs {self.generation_kwargs}")
        output = self.llm(
            inputs,
            **self.generation_kwargs
        )
        output_text = output["choices"][0]["text"]
        return output_text

    def _stream_generate(self, inputs):
        output = self.llm(
            inputs,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        for out in output:
            stream = copy.deepcopy(out)
            # yield the accumulated text so far, not just the newest chunk
            generated_text += stream["choices"][0]["text"]
            yield generated_text


bot = Qwen2Simulator()

if __name__ == "__main__":
    # messages = [
    #     {"role": "system", "content": "you are a helpful assistant"},
    #     {"role": "user", "content": "What is the capital of France?"}
    # ]
    # output = bot.generate_response(messages)
    # print(output)
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what's your name"},
        {"role": "assistant", "content": "My name is Jordan"}
    ]
    print(list(bot.generate_query(messages, stream=True)))
    print(bot.generate_query(messages, stream=False))
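
    # Sketch of the response path as well (assumes the same model is available);
    # generate_response requires the conversation to end with a user turn.
    messages.append({"role": "user", "content": "nice to meet you, Jordan"})
    print(list(bot.generate_response(messages, stream=True)))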