# app.py
# =============
# This is a complete app.py file for a text generation app using the prithivMLmods/Llama-Magpie-3.2-3B-Instruct model.
# The app uses the Gradio library to create a web interface for interacting with the model.

# Imports
# =======
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Constants
# =========
MODEL_NAME = "prithivMLmods/Llama-Magpie-3.2-3B-Instruct"
SYSTEM_MESSAGE = "You are an AI assistant, and your name is Llama-Magpie-3.2-3B-Instruct."

# Load Model and Tokenizer
# ========================
def load_model_and_tokenizer():
    """
    Load the model and tokenizer from Hugging Face.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype="auto",
        device_map="cpu"  # Ensure the model runs on the CPU
    )
    return model, tokenizer

# Ensure the model and tokenizer are loaded
model, tokenizer = load_model_and_tokenizer()
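
# GPU Loading Variant (optional sketch)
# =====================================
# A hedged alternative to the CPU load above, assuming a CUDA-capable GPU and
# the accelerate package are available; device_map="auto" lets accelerate
# place the weights. Shown commented out so the CPU path stays in effect:
#
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     torch_dtype="auto",   # use the checkpoint's native dtype (e.g. bf16)
#     device_map="auto"     # spread layers across available devices
# )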

# Generate Response
# =================
def generate_response(prompt, chat_history, max_new_tokens, temperature):
    """
    Generate a response from the model based on the user prompt and chat history.
    `chat_history` must contain only previous turns; the current user prompt
    is appended here, so callers should not include it twice. Yields the
    accumulated response text as each generated token is decoded.
    """
    messages = [{"role": "system", "content": SYSTEM_MESSAGE}] + chat_history + [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Note: model.generate() blocks until generation is complete, so the loop
    # below only starts yielding once the full sequence is available.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=temperature,
        return_dict_in_generate=True
    )

    # Decode only the newly generated tokens (everything after the prompt).
    # Decoding token-by-token can mangle characters that span multiple tokens;
    # see the TextIteratorStreamer sketch below for a more robust approach.
    response = ""
    for token_id in generated_ids.sequences[0][len(model_inputs.input_ids[0]):]:
        response += tokenizer.decode([token_id], skip_special_tokens=True)
        yield response
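
# True Token Streaming (optional sketch)
# ======================================
# The generator above only yields after model.generate() has finished. For
# genuinely incremental output, transformers provides TextIteratorStreamer:
# generate() runs in a background thread while the streamer yields decoded
# text as tokens arrive. This is a minimal sketch under the same model and
# tokenizer; it is not wired into the interface below.
from threading import Thread
from transformers import TextIteratorStreamer

def generate_response_streaming(prompt, chat_history, max_new_tokens, temperature):
    """
    Alternative to generate_response that streams text while generating.
    """
    messages = [{"role": "system", "content": SYSTEM_MESSAGE}] + chat_history + [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=temperature,
        streamer=streamer,
    )
    Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()
    response = ""
    for new_text in streamer:  # yields decoded chunks as they are generated
        response += new_text
        yield response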

# Clear Chat History
# ==================
def clear_chat():
    """
    Clear the chat history.
    """
    return [], ""

# Gradio Interface
# =================
def gradio_interface():
    """
    Create and launch the Gradio interface.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Chat with prithivMLmods/Llama-Magpie-3.2-3B-Instruct", type="messages")
                msg = gr.Textbox(label="User Input")
                with gr.Row():
                    submit = gr.Button("Submit")
                    clear = gr.Button("Clear Chat")
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### Settings")
                    max_new_tokens = gr.Slider(50, 1024, value=512, step=1, label="Max New Tokens")
                    temperature = gr.Slider(0.1, 1.0, value=0.7, step=0.05, label="Temperature")

        def respond(message, chat_history, max_new_tokens, temperature):
            # Keep the prior turns separate: generate_response appends the new
            # user message itself, so passing the updated history would
            # duplicate it in the prompt.
            previous_turns = list(chat_history)
            chat_history.append({"role": "user", "content": message})
            response = ""
            for partial in generate_response(message, previous_turns, max_new_tokens, temperature):
                response = partial
                # Show the in-progress assistant message while streaming.
                yield chat_history + [{"role": "assistant", "content": response}], ""
            chat_history.append({"role": "assistant", "content": response})
            yield chat_history, ""

        submit.click(respond, [msg, chatbot, max_new_tokens, temperature], [chatbot, msg])
        msg.submit(respond, [msg, chatbot, max_new_tokens, temperature], [chatbot, msg])
        clear.click(clear_chat, None, [chatbot, msg])
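
    # Streaming note (version assumption): Gradio 3.x only streams generator
    # output when the queue is enabled, e.g. demo.queue().launch(); Gradio 4.x
    # enables queueing by default, so a plain launch() suffices there.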

    demo.launch()

# Main
# ====
if __name__ == "__main__":
    gradio_interface()

# Dependencies
# =============
# The following dependencies are required to run this app:
# - transformers
# - gradio
# - torch
# - accelerate
#
# You can install these dependencies using pip:
# pip install transformers gradio torch accelerate
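#
# For deployment (e.g., a Hugging Face Space), the same list can go in a
# requirements.txt file (left unpinned here; pin versions as needed):
#
#   transformers
#   gradio
#   torch
#   accelerate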