"""
https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
https://github.com/awinml/llama-cpp-python-bindings

python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat

python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/


./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128

./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128

./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv


## References

- https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/llms/llamacpp.py
- https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/server.py
- https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/server/app.py

"""

import json
import copy
import os

from models.base_model import Simulator
import llama_cpp
import llama_cpp.llama_tokenizer  # explicitly import the submodule used for LlamaHFTokenizer below
from transformers import AutoTokenizer
from utils.logging_util import logger
import config


class Qwen2Simulator(Simulator):
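    """Qwen2 chat simulator backed by llama-cpp-python.

    It can generate both the next user query (generate_query) and the assistant
    response (generate_response), streaming text as it is produced.
    """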

    def __init__(self):
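        """Load the GGUF model from a local path if it exists, otherwise from the Hugging Face Hub."""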
        local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
        if os.path.exists(local_path):
            self.hf_tokenizer = AutoTokenizer.from_pretrained(
                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
            self.llm = llama_cpp.Llama(  # n_ctx, n_threads
                model_path=local_path,
                # The default tokenizer has a bug: the tokenized ids differ, so wrap the HF tokenizer instead
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                # n_threads=None,  # defaults to a value based on the number of CPU cores

                use_mlock=True,
                verbose=True,
            )
        else:
            self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                filename="*fp16.gguf",
                n_ctx=config.MAX_SEQUENCE_LENGTH,
                use_mlock=True,
                verbose=False,
            )
        logger.info(f"llm has been initialized: {self.llm}, "
                    f"n_threads={self.llm.n_threads}, n_ctx={self.llm.n_ctx}, "
                    f"env[CACHE]={os.environ.get('CACHE', None)}")

        self.generation_kwargs = dict(
            temperature=config.DEFAULT_TEMPERATURE,
            top_p=config.DEFAULT_TOP_P,
            top_k=config.DEFAULT_TOP_K,
            max_tokens=config.DEFAULT_MAX_TOKENS,
            repeat_penalty=1.1,
            # qwen2-0.5b-chat sometimes ends generation without <|im_end|>, going straight to <|im_start|>
            stop=[
                "<|im_end|>",
                "<|im_start|>",
                "<|endoftext|>",
            ],
        )

    def tokenize(self, text):
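        """Encode text to token ids with the model's tokenizer."""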
        return self.llm.tokenize(text.encode("utf-8"))

    def generate_query(self, message, history_tokens, stream=True):
        """
        """
        # {% for message in messages %}
        #   {% if loop.first and messages[0]['role'] != 'system' %}
        #     {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
        #   {% endif %}
        #   {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
        # {% endfor %}
        # {% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",

        input_ids = history_tokens + self.tokenize(
            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>user\n"
        )
        if stream:
            return self._stream_generate(input_ids)
        else:
            return self._generate(input_ids)

    def generate_response(self, message, history_tokens, stream=True):
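        """Append `message` to history_tokens and generate the assistant's reply."""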
        input_ids = history_tokens + self.tokenize(
            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>assistant\n"
        )
        if stream:
            return self._stream_generate(input_ids)
        else:
            return self._generate(input_ids)

    def _stream_generate(self, input_ids):
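        """Stream a completion for the prompt token ids, yielding (text, all_tokens) chunks."""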
        logger.info(f"generation_kwargs {self.generation_kwargs}")

        # Related llama_cpp.Llama members: generate, set_cache, last_n_tokens_size, reset, ctx, _ctx
        output = self.llm.create_completion(
            input_ids,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        # TODO: check the finish_reason; if it is "length", shift the context window and continue generating.
        # TODO: also return the token ids.
        for out in output:
            stream = copy.deepcopy(out)
            if stream["choices"][0]["finish_reason"] is None:
                generated_text += stream["choices"][0]["text"]
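                # Note: "completion_text" and "all_tokens" are not fields in the upstream
                # llama-cpp-python streaming response; they are assumed to come from a
                # locally patched build. Otherwise fall back to the accumulated text.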
                if "completion_text" in stream["choices"][0]:
                    yield stream["choices"][0]["completion_text"], stream["choices"][0]["all_tokens"]
                else:
                    logger.info("completion_text not found")
                    yield generated_text, None
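
    def _generate(self, input_ids):
        # Minimal non-streaming sketch: generate_query/generate_response call
        # self._generate when stream=False, but the original file does not define it.
        # Assumes the standard llama-cpp-python create_completion return format.
        output = self.llm.create_completion(
            input_ids,
            stream=False,
            **self.generation_kwargs
        )
        return output["choices"][0]["text"]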


bot = Qwen2Simulator()

if __name__ == "__main__":
    # messages = [
    #     {"role": "system", "content": "you are a helpful assistant"},
    #     {"role": "user", "content": "What is the capital of France?"}
    # ]
    # output = bot.generate_response(messages)
    # print(output)

    message = {"role": "system", "content": "你是一个导游。"}
    print(message)
    for generated_text, all_tokens in bot.generate_query(message, [], stream=True):
        print(generated_text, all_tokens)

    message = {"role": "user", "content": generated_text}
    print(message)
    for generated_text, all_tokens in bot.generate_response(message, all_tokens, stream=True):
        print(generated_text, all_tokens)