import gradio as gr
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer

# Configuration
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # Base 4-bit model the adapter was trained on
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"  # Fine-tuned LoRA adapter

max_seq_length = 512  # Maximum context length; adjust as needed
dtype = None          # None lets Unsloth auto-detect (float16 on older GPUs, bfloat16 on Ampere+)
load_in_4bit = True   # Use 4-bit quantization to reduce memory usage

# Load the model and tokenizer using FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Load the adapter
model.load_adapter(lora_adapter)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Optional: Initialize TextStreamer if you plan to use streaming
# text_streamer = TextStreamer(tokenizer, skip_prompt=True)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Build the conversation as chat messages: system prompt, then the
    # alternating user/assistant turns from history, then the new user message
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_reply in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_reply})
    messages.append({"role": "user", "content": message})

    # Apply the chat template and tokenize the input
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    # Generate response
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
        # streamer=text_streamer,  # Uncomment if using streaming
    )

    # Decode only the newly generated tokens (i.e., skip the prompt)
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response.strip()

# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()