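# Gradio chat demo for a Llama LoRA model ("llama_lora_model_1") served with
# Unsloth. The model is loaded in 4-bit, the conversation is formatted with the
# tokenizer's chat template, and generation settings (max new tokens,
# temperature, top-p) are exposed as controls in the UI.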
import gradio as gr
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Choose any; Unsloth supports RoPE scaling internally.
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

# Check for GPU availability and use the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Note: .to() is not supported for 4-bit (bitsandbytes) quantized models;
# Unsloth already places the model on the GPU when one is available.
if not load_in_4bit:
    model.to(device)
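
# Unsloth also provides FastLanguageModel.for_inference(), which switches the
# model into its faster inference mode; enabling it here is optional but recommended.
FastLanguageModel.for_inference(model)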

# Chat callback used by gr.ChatInterface
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Prepare the system message
    messages = [{"role": "system", "content": system_message}]

    # Add history to the messages
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    # Add the current message from the user
    messages.append({"role": "user", "content": message})

    # Tokenize the conversation with the model's chat template.
    # With tokenize=True (and no return_dict=True), this returns a tensor of token ids.
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(device)

    # Generate the response
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        do_sample=True,  # Required for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,
        use_cache=True,
    )

    # Decode only the newly generated tokens (the prompt is sliced off)
    response = tokenizer.batch_decode(
        outputs[:, input_ids.shape[1]:], skip_special_tokens=True
    )

    return response[0]


# Gradio interface setup
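# ChatInterface calls respond() with the new message and the chat history first,
# followed by each additional_inputs value (system message, max tokens,
# temperature, top-p) in order.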
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

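# Launching starts a local Gradio server; demo.launch(share=True) would
# additionally create a temporary public URL.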
if __name__ == "__main__":
    demo.launch()