import gradio as gr
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
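# Note (assumption): bitsandbytes 4-bit quantization is primarily a CUDA/GPU feature;
# on a CPU-only machine it may fail to load, in which case set load_in_4bit = False.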
# Force the model to run on CPU only by setting the device to "cpu"
device = "cpu"
# Load the model and tokenizer (the model is moved to CPU afterwards)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Move the model to CPU (even if it was initially loaded with GPU support)
model.to(device)
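# Optional: FastLanguageModel.for_inference(model) can be called here to enable
# Unsloth's faster inference mode.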
# Chat callback used by the Gradio ChatInterface below
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Prepare the system message
    messages = [{"role": "system", "content": system_message}]

    # Add the conversation history (pairs of user / assistant turns)
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    # Add the current message from the user
    messages.append({"role": "user", "content": message})

    # Apply the chat template and tokenize the prompt
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,  # return a dict so "input_ids" can be indexed below
        return_tensors="pt",
    )

    # Generate the response on CPU
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(device),  # ensure the prompt tensor is on the CPU
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=max_tokens,
        do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,
        use_cache=True,
    )

    # Decode only the newly generated tokens, skipping the prompt portion
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
    return response[0]

# Gradio interface setup
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
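# When run directly (e.g. `python app.py`, if the file is saved as app.py), Gradio
# serves the chat UI locally; the default address is http://127.0.0.1:7860.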
if __name__ == "__main__":
    demo.launch()