"""
https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
https://github.com/awinml/llama-cpp-python-bindings
python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
## References
- https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py
"""
import json
import copy
import os
from models.base_model import Simulator
import llama_cpp
import llama_cpp.llama_tokenizer  # provides LlamaHFTokenizer used below
from transformers import AutoTokenizer
from utils.logging_util import logger
import config
class Qwen2Simulator(Simulator):
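    """Simulates both sides of a Qwen2-0.5B-Instruct conversation (user queries and
    assistant responses) with llama-cpp-python, preferring a local GGUF file and
    falling back to downloading the model from the Hugging Face Hub.
    """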
def __init__(self):
local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
if os.path.exists(local_path):
self.hf_tokenizer = AutoTokenizer.from_pretrained(
"/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
            self.llm = llama_cpp.Llama(  # n_ctx, n_threads
                model_path=local_path,
                # The built-in gguf tokenizer is buggy here (it produces different token ids),
                # so wrap the HF tokenizer instead.
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                # n_threads=None,  # by default n_threads is derived from the CPU count
                use_mlock=True,
                verbose=True,
            )
else:
self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
self.llm = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
filename="*fp16.gguf",
n_ctx=config.MAX_SEQUENCE_LENGTH,
use_mlock=True,
verbose=False,
)
logger.info(f"llm has been initialized: {self.llm}, "
f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
f"env[CACHE]={os.environ.get('CACHE', None)}")
self.generation_kwargs = dict(
temperature=config.DEFAULT_TEMPERATURE,
top_p=config.DEFAULT_TOP_P,
top_k=config.DEFAULT_TOP_K,
max_tokens=config.DEFAULT_MAX_TOKENS,
repeat_penalty=1.1,
            # qwen2-0.5b-chat sometimes ends a generation without <|im_end|> and goes
            # straight to <|im_start|>, so treat both as stop sequences.
stop=[
"<|im_end|>",
"<|im_start|>",
"<|endoftext|>",
],
)
def tokenize(self, text):
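        """Encode `text` (UTF-8) into a list of token ids with the wrapped tokenizer."""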
return self.llm.tokenize(text.encode("utf-8"))
def generate_query(self, message, history_tokens, stream=True):
"""
"""
# {% for message in messages %}
# {% if loop.first and messages[0]['role'] != 'system' %}
# {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
# {% endif %}
# {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
# {% endfor %}
# {% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
input_ids = history_tokens + self.tokenize(
f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>user\n"
)
if stream:
return self._stream_generate(input_ids)
else:
return self._generate(input_ids)
def generate_response(self, message, history_tokens, stream=True):
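        """Append `message` to `history_tokens` and generate the next *assistant* turn."""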
input_ids = history_tokens + self.tokenize(
f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>assistant\n"
)
if stream:
return self._stream_generate(input_ids)
else:
return self._generate(input_ids)
def _stream_generate(self, input_ids):
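        """Stream a completion for `input_ids`, yielding (text, token_ids or None) pairs."""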
logger.info(f"generation_kwargs {self.generation_kwargs}")
# self.llm.generate .set_cache .last_n_tokens_size .reset .ctx ._ctx
output = self.llm.create_completion(
input_ids,
stream=True,
**self.generation_kwargs
)
generated_text = ""
        # TODO: check the finish_reason; if it is "length", shift the context window and keep generating.
        # TODO: also return the token ids.
for out in output:
stream = copy.deepcopy(out)
if stream["choices"][0]["finish_reason"] is None:
generated_text += stream["choices"][0]["text"]
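            # NOTE: `completion_text` / `all_tokens` are not standard llama-cpp-python chunk
            # fields; they are assumed to be provided by the patched llama_cpp build used here.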
if "completion_text" in stream["choices"][0]:
yield stream["choices"][0]["completion_text"], stream["choices"][0]["all_tokens"]
else:
logger.info("completion_text not found")
yield generated_text, None
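
    def _generate(self, input_ids):
        """Non-streaming counterpart of `_stream_generate`.

        The original implementation is not shown in this excerpt; this is a minimal
        sketch that assumes the standard llama-cpp-python `create_completion()` result
        format and returns only the generated text.
        """
        logger.info(f"generation_kwargs {self.generation_kwargs}")
        output = self.llm.create_completion(
            input_ids,
            stream=False,
            **self.generation_kwargs
        )
        # A non-streaming completion returns a single choice with the full text.
        return output["choices"][0]["text"]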
bot = Qwen2Simulator()
if __name__ == "__main__":
# messages = [
# {"role": "system", "content": "you are a helpful assistant"},
# {"role": "user", "content": "What is the capital of France?"}
# ]
# output = bot.generate_response(messages)
# print(output)
message = {"role": "system", "content": "你是一个导游。"}
print(message)
for generated_text, all_tokens in bot.generate_query(message, [], stream=True):
print(generated_text, all_tokens)
message = {"role": "user", "content": generated_text}
print(message)
for generated_text, all_tokens in bot.generate_query(message, all_tokens, stream=True):
print(generated_text, all_tokens)