Spaces:
Runtime error
Runtime error
File size: 2,606 Bytes
037a015 1ffa7d1 94d5aca ce0d45f a9938e0 1ccc577 a9938e0 94d5aca a9938e0 94d5aca 32ab136 94d5aca ce0d45f 1ffa7d1 b42ac71 e1c82eb a9938e0 e1c82eb b42ac71 e1c82eb b42ac71 037a015 94d5aca b42ac71 037a015 b42ac71 037a015 b42ac71 037a015 94d5aca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoAdapterModel, TextStreamer
# Configuration variables
# Checkpoint identifier passed to `from_pretrained`.
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
# Adapter identifier (the name suggests LoRA weights trained on top of the base checkpoint).
lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"
# Token limit applied when the prompt is tokenized; longer prompts are truncated.
max_seq_length = 512
# NOTE(review): presumably meant for the model loader (None = let the loader decide) -- confirm.
dtype = None
# NOTE(review): presumably requests 4-bit quantized loading -- confirm it reaches the loader.
load_in_4bit = True
# Dynamically select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fail fast: the quantized checkpoint and unsloth both need a CUDA GPU.
if device.type != "cuda":
    raise RuntimeError("No CUDA GPU available. Please ensure your Space has GPU enabled.")

# Imported only after the CUDA check so a CPU-only host gets the clear
# RuntimeError above instead of an import-time failure.
from unsloth import FastLanguageModel

# Load the base model together with the LoRA adapter. Given an adapter repo,
# unsloth resolves the base checkpoint from the adapter config and attaches the
# adapter weights. The quantized model is placed on the GPU by the loader, so
# no explicit `.to(device)` is needed (and bitsandbytes 4-bit models may
# reject it).
# NOTE(review): this block no longer uses `AutoAdapterModel`; the top-of-file
# import of it from `transformers` appears not to exist in mainline
# transformers and should be checked.
model, _adapter_tokenizer = FastLanguageModel.from_pretrained(
    model_name=lora_adapter,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Switch unsloth's patched layers into inference mode.
FastLanguageModel.for_inference(model)
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate the assistant's reply for one chat turn.

    Args:
        message: The latest user message.
        history: Earlier turns as ``(user_message, assistant_reply)`` pairs.
        system_message: Text placed at the top of the prompt.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The generated continuation only (prompt excluded), with surrounding
        whitespace stripped.
    """
    # Combine system message and chat history into one plain-text prompt.
    prompt_parts = [f"{system_message}\n"]
    prompt_parts.extend(
        f"User: {user_msg}\nAssistant: {bot_reply}\n"
        for user_msg, bot_reply in history
    )
    prompt_parts.append(f"User: {message}\nAssistant:")
    chat_history = "".join(prompt_parts)

    # Prepare the input for the model.
    # NOTE(review): with the tokenizer's default truncation side, an over-long
    # prompt presumably loses its *end* (the newest message) -- confirm and
    # consider left truncation.
    inputs = tokenizer(
        chat_history,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate the response.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly: pad and EOS share an id here, so it
            # cannot be inferred reliably from the input ids.
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            # temperature/top_p only take effect when sampling is enabled.
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Drop the prompt by token count rather than by character count. The
    # decoded prompt need not match `chat_history` (truncation, special tokens,
    # whitespace normalisation), so slicing the decoded string could cut into
    # the reply or leave prompt text behind.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
# Load the tokenizer for `model_name`, requesting the fast implementation.
# `respond` looks this module-level name up at call time, so defining it after
# the function is fine.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Chat UI. `respond` is called with the message and history, followed by the
# values of the extra controls below in the order they are listed.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a friendly assistant."),
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.95,
        ),
    ],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|