"""
References:
https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
https://github.com/awinml/llama-cpp-python-bindings

Convert the HF checkpoint to a GGUF file (f16):
python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat

python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/


Sanity checks with llama-cli (plain completion, prompt file, interactive chat):
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128

./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128

./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv

"""

import json
import copy
from simulator import Simulator
import llama_cpp
import llama_cpp.llama_tokenizer  # used below via llama_cpp.llama_tokenizer.LlamaHFTokenizer
from transformers import AutoTokenizer
from log_util import logger


class Qwen2Simulator(Simulator):

    def __init__(self, from_local=False):
        if from_local:
            self.hf_tokenizer = AutoTokenizer.from_pretrained(
                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
            self.llm = llama_cpp.Llama(
                model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                verbose=False,
            )
        else:
            self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                filename="*fp16.gguf",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                verbose=False,
            )
        logger.info(f"llm has been initialized: {self.llm}")
        # warmup
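        # (A possible warmup, sketched as a suggestion rather than the original
        # implementation: one tiny generation so the first real request does not
        # pay the graph-build / cache-allocation cost.)
        # self.llm("<|im_start|>system\nyou are a helpful assistant<|im_end|>\n", max_tokens=1)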


        self.generation_kwargs = dict(
            temperature=5,
            # top_p=0.1,
            top_k=40,
            max_tokens=20,
            repeat_penalty=1.1,
            stop=[
                "<|im_end|>",
                "<|im_start|>",
                "<|endoftext|>",
            ],
        )
        ### local

    def generate_query(self, messages, stream=True):
        """
        :param messages:
        :return:
        """
        assert messages[-1]["role"] != "user"
        logger.info(f"generating {json.dumps(messages)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        inputs = inputs + "<|im_start|>user\n"
        if stream:
            return self._stream_generate(inputs)
        else:
            return self._generate(inputs)


    def generate_response(self, messages, stream=True):
        assert messages[-1]["role"] == "user"
        logger.info(f"generating {json.dumps(messages)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        if stream:
            return self._stream_generate(inputs)
        else:
            return self._generate(inputs)

    def _generate(self, inputs):
        """
        TODO: chat with cache.
        qwen2-0.5b-chat has a bug: sometimes a generated user turn does not end with <|im_end|>, for example:
            <|im_start|>system
            you are a helpful assistant<|im_end|>
            <|im_start|>user
            hi, what your name<|im_end|>
            <|im_start|>assistant
            My name is Jordan<|im_end|>
            <|im_start|>user              # everything above is the input; everything below is generated
            how old are you?
            <|im_start|>assistant
            I am a 41-year-old man.<|im_end|>
        """
        output = self.llm(
            inputs,
            **self.generation_kwargs
        )
        output_text = output["choices"][0]["text"]
        return output_text

    def _stream_generate(self, inputs):
        output = self.llm(
            inputs,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        for out in output:
            stream = copy.deepcopy(out)
            generated_text += stream["choices"][0]["text"]
            yield generated_text

bot = Qwen2Simulator()

if __name__ == "__main__":
    # messages = [
    #     {"role": "system", "content": "you are a helpful assistant"},
    #     {"role": "user", "content": "What is the capital of France?"}
    # ]
    # output = bot.generate_response(messages)
    # print(output)

    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what your name"},
        {"role": "assistant", "content": "My name is Jordan"}
    ]
    print(list(bot.generate_query(messages, stream=True)))
    print(bot.generate_query(messages, stream=False))
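
    # Sketch (not in the original): after simulating a user query, append a
    # follow-up user turn and stream the assistant reply. The content of the
    # follow-up turn here is made up for the demo.
    messages.append({"role": "user", "content": "how old are you?"})
    for partial_text in bot.generate_response(messages, stream=True):
        print(partial_text)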