import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
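
# Load the tokenizer and the 4-bit (bitsandbytes) quantized Llama 3.2 3B
# Instruct base model; device_map="auto" places weights on the available device(s).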
base_model_name = "unsloth/llama-3.2-3b-instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
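
# Attach the LoRA adapter on top of the frozen base weights, then switch to
# inference mode (eval() disables dropout).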
adapter_path = "Grandediw/lora_model"
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()
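

# Chat handler for gr.ChatInterface: receives the new message, the running
# (user, assistant) history, and the three slider values in order.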
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
    top_p,
):
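    # Flatten prior turns into a plain-text transcript, then leave an open
    # "Assistant:" cue for the model to complete.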
    context = ""
    for user_message, assistant_message in history:
        context += f"User: {user_message}\nAssistant: {assistant_message}\n"
    context += f"User: {message}\nAssistant:"
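    # Note: this plain "User:/Assistant:" transcript is a simple generic prompt;
    # an instruct-tuned Llama tokenizer typically also ships a native chat
    # template usable via tokenizer.apply_chat_template.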
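
    # Tokenize the prompt and move it to the model's device (works on CPU too,
    # unlike a hard-coded "cuda").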
    inputs = tokenizer(context, return_tensors="pt").to(model.device)
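
    # Sample a continuation; the attention mask is passed explicitly so
    # generate() does not have to infer it.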
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
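
    # Decode only the newly generated tokens, dropping the echoed prompt.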
    response = tokenizer.decode(
        outputs[0, inputs.input_ids.shape[-1]:],
        skip_special_tokens=True,
    )
    return response
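

# Build the chat UI; the sliders are rendered as additional inputs and passed
# to respond() in the order listed.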
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.5, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"),
    ],
)
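

# Start the local Gradio server when the script is run directly.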
if __name__ == "__main__":
    demo.launch()