import os
import spaces
import gradio as gr
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model = os.environ.get("MODEL_ID")
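# MODEL_ID is assumed to point at a GPTQ-quantized Qwen chat repo on the Hub,
# e.g. "Qwen/Qwen2-7B-Instruct-GPTQ-Int4" (an illustrative value, not set by this file).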
model_name = model.split("/")[-1]

# TITLE is referenced below via gr.HTML(TITLE) but was never defined in this file;
# a minimal placeholder keeps the app launchable.
TITLE = f"<h1>{model_name}</h1>"

DESCRIPTION = f"""
<h3>MODEL: <a href="https://hf.co/{model}">{model_name}</a></h3>
<center>
<p>Qwen is a family of large language models built by Alibaba Cloud.
<br>
Feel free to try it out without logging in.
</p>
</center>
"""

css="""
h1 {
    text-align: center;
}
footer {
    visibility: hidden;
}
"""


# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model)

# The sampling parameters below follow the default decoding hyperparameters of
# Qwen2-7B-Instruct; max_tokens caps the maximum generation length.

# Load the model by name or path; GPTQ- and AWQ-quantized checkpoints are supported.
llm = LLM(model=model, quantization="gptq", kv_cache_dtype="fp8_e5m2")
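# Note: quantization is hard-coded to "gptq" here; an AWQ checkpoint would need
# quantization="awq" instead (recent vLLM versions can usually also auto-detect the
# quantization method from the model config when the argument is omitted).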

@spaces.GPU
def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):
    # Prepare your prompts
    conversation = [
        {"role": "system", "content":sytem}
    ]
    for prompt, answer in history:
        conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")
    
    text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=penalty,
        max_tokens=max_tokens,
        # Qwen2 stop tokens: <|im_end|> (151645) and <|endoftext|> (151643).
        stop_token_ids=[151645, 151643],
    )
    # Generate the completion; a single prompt yields a single RequestOutput.
    outputs = llm.generate([text], sampling_params)

    output = outputs[0]
    generated_text = output.outputs[0].text
    print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
    return generated_text


    

with gr.Blocks(css=css, fill_height=True) as demo:
    gr.HTML(TITLE)
    gr.HTML(DESCRIPTION)
    gr.ChatInterface(
        fn=generate,
        chatbot=gr.Chatbot(scale=1),
        additional_inputs=[
            gr.Textbox(value="You are a helpful assistant.", label="System message"),
            gr.Slider(minimum=1, maximum=30720, value=2048, step=1, label="Max tokens"),
            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p",
            ),
            gr.Slider(
                minimum=1,  # vLLM requires top_k >= 1 (or -1 to disable)
                maximum=20,
                value=20,
                step=1,
                label="Top-k",
            ),
            gr.Slider(
                minimum=0.1,  # repetition_penalty must be greater than 0
                maximum=2.0,
                value=1,
                step=0.1,
                label="Repetition penalty",
            ),
        ],
        retry_btn="Retry",
        undo_btn="Undo",
        clear_btn="Clear",
        submit_btn="Send",
    )

if __name__ == "__main__":
    demo.launch()
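
# One possible way to run this locally (the repo name below is only an illustrative
# example; any GPTQ-quantized Qwen chat model should work, and the `spaces` package
# must be installed even though its GPU decorator only takes effect on ZeroGPU Spaces):
#
#   MODEL_ID="Qwen/Qwen2-7B-Instruct-GPTQ-Int4" python app.py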