import gradio as gr
import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
# Configuration Variables
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" # Replace with your actual model name
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
max_seq_length = 512 # Adjust as needed
dtype = None # None lets Unsloth auto-detect (float16 on older GPUs, bfloat16 on Ampere+)
load_in_4bit = True # Set to True if you want to use 4-bit quantization
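# Note: max_seq_length caps the total tokens (prompt plus generation) handled per
# request; 512 is conservative and can be raised toward the model's context limit
# if GPU memory allows.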
# Load the model and tokenizer using FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Load the adapter
model.load_adapter(lora_adapter)
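# load_adapter attaches the LoRA weights on top of the 4-bit base model
# (requires the peft package; the adapter must match the base architecture).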
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)
# Optional: Initialize TextStreamer if you plan to use streaming
# text_streamer = TextStreamer(tokenizer, skip_prompt=True)
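# Note: TextStreamer writes tokens to stdout; to stream tokens into the Gradio UI
# you would typically use transformers.TextIteratorStreamer and yield partial text
# from respond() instead.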
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Build the conversation as an interleaved list of role/content messages
    # so the tokenizer's chat template can format it correctly.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_reply in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_reply})
    messages.append({"role": "user", "content": message})
    # Apply the chat template and tokenize the input
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must be added for generation
        return_tensors="pt",
        return_dict=True,  # Return input_ids and attention_mask
    ).to("cuda")
    # Generate the response
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True,
        # streamer=text_streamer  # Uncomment if using streaming
    )
    # Decode only the newly generated tokens, skipping the prompt
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True).strip()
    return response
# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
if __name__ == "__main__":
    demo.launch()