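"""Gradio chat demo serving a Qwen model with vLLM on a Hugging Face Space.

The model is chosen at runtime through the MODEL_ID environment variable; the
id below is only an example of how a local run might look, not a value taken
from this Space:

    MODEL_ID="Qwen/Qwen2-7B-Instruct" python app.py

Requires gradio, transformers, vllm and the Hugging Face `spaces` package
(used here for ZeroGPU allocation).
"""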
import os
import spaces
import gradio as gr
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model = os.environ.get("MODEL_ID")
model_name = model.split("/")[-1]

DESCRIPTION = f"""
<h3>MODEL: <a href="https://hf.co/{model}">{model_name}</a></h3>
<center>
<p>Qwen is a family of large language models built by Alibaba Cloud.
<br>
Feel free to try it out; no login required.
</p>
</center>
"""

css="""
h1 {
    text-align: center;
}
footer {
    visibility: hidden;
}
"""


# The tokenizer is used only to render the chat template into a prompt string.
tokenizer = AutoTokenizer.from_pretrained(model)



# spaces.GPU allocates a ZeroGPU device for the duration of each call.
@spaces.GPU
def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):
    # Build the conversation as a list of chat messages.
    conversation = [
        {"role": "system", "content": system}
    ]
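    # gr.ChatInterface passes `history` as (user, assistant) pairs.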
    for prompt, answer in history:
        conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")
    
    text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=penalty,
        max_tokens=max_tokens,
        # Stop on Qwen's <|im_end|> and <|endoftext|> special tokens.
        stop_token_ids=[151645, 151643],
    )
    # Build the vLLM engine inside the GPU-decorated call (ZeroGPU attaches a
    # device only for the duration of this function) and generate the outputs.
    llm = LLM(model=model)
    outputs = llm.generate([text], sampling_params)
    
    # Only one prompt was submitted, so return its first completion.
    generated_text = outputs[0].outputs[0].text
    print(f"Prompt: {text!r}, Generated text: {generated_text!r}")
    return generated_text


    

# The chatbot component handed to ChatInterface below; created outside the
# Blocks context so ChatInterface can render it itself (the height is an
# arbitrary choice).
chatbot = gr.Chatbot(height=500, label=model_name)

with gr.Blocks(css=css, fill_height=True) as demo:
    gr.HTML(DESCRIPTION)
    gr.ChatInterface(
        fn=generate,
        chatbot=chatbot,
        additional_inputs=[
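            # These inputs map positionally to generate()'s extra arguments:
            # system, max_tokens, temperature, top_p, top_k, penalty.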
            gr.Textbox(value="You are a helpful assistant.", label="System message"),
            gr.Slider(minimum=1, maximum=30720, value=2048, step=1, label="Max tokens"),
            gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p",
            ),
            gr.Slider(
                minimum=0,
                maximum=20,
                value=20,
                step=1,
                label="Top-k",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Repetition penalty",
            ),
        ],
        retry_btn="Retry",
        undo_btn="Undo",
        clear_btn="Clear",
        submit_btn="Send",
    )

if __name__ == "__main__":
    demo.launch()