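# Gradio chat demo for a LoRA fine-tuned Llama model ("llama_lora_model_1") loaded with Unsloth.
# Each user turn is rebuilt into a chat-template prompt and answered via model.generate().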
import gradio as gr
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Check for GPU availability and use the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama_lora_model_1",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable Unsloth's optimized inference mode
if not load_in_4bit:
    model.to(device)  # A 4-bit quantized model cannot be moved with .to()
# Respond function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Prepare the system message
    messages = [{"role": "system", "content": system_message}]

    # Add history to the messages
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Add the current message from the user
    messages.append({"role": "user", "content": message})
    # Prepare the inputs for the model
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,  # Return a dict so input_ids/attention_mask can be indexed below
        return_tensors="pt",
    )
    # Generate the response using your model
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(device),  # Ensure input is on the correct device
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,  # Sampling must be enabled for temperature/top_p to take effect
        use_cache=True,
    )
    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    # Return the response
    return response[0]
# Gradio interface setup
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
if __name__ == "__main__":
    demo.launch()