import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration Variables
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # 4-bit quantized base model
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"  # LoRA adapter trained on top of the base model

max_seq_length = 512  # Maximum prompt length (in tokens) fed to the model
dtype = None          # None lets transformers choose a suitable dtype
load_in_4bit = True   # Kept for reference; the base checkpoint is already 4-bit (bitsandbytes)

# Dynamically select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model only when a GPU is available (the 4-bit checkpoint requires CUDA)
if device.type == "cuda":
    # device_map places the quantized weights on the GPU; .to() is not supported
    # for bitsandbytes 4-bit models
    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, device_map="auto", torch_dtype=dtype
    )
    # Attach the LoRA adapter via the transformers/PEFT integration (requires `peft`)
    model.load_adapter(lora_adapter)
else:
    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")

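# gr.ChatInterface calls respond(message, history, *additional_inputs); this
# function expects `history` as a list of (user_message, assistant_reply) pairs.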
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Combine system message and chat history
    chat_history = f"{system_message}\n"
    for user_msg, bot_reply in history:
        chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
    chat_history += f"User: {message}\nAssistant:"
    
    # Prepare the input for the model
    inputs = tokenizer(
        chat_history,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(device)
    
    # Generate the response (do_sample=True so temperature/top_p actually take effect)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )
    
    # Decode only the newly generated tokens (everything after the prompt)
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return response

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
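    # On Hugging Face Spaces the app is served automatically when this file runs;
    # locally, demo.launch() starts a web server (http://127.0.0.1:7860 by default).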
    demo.launch()