"""Gradio chat demo for the "Samyuktha" persona: a Qwen2.5 base model with a PEFT adapter, decoded token by token so replies stream into the chat UI."""

import gradio as gr
import torch

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
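
# Base checkpoint and the fine-tuned PEFT adapter that is loaded on top of it.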
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
ADAPTER_MODEL = "nnpy/unsloth-qwen2.5-3b-chat-ft"

print("Loading model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
model = PeftModel.from_pretrained(model, ADAPTER_MODEL)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model.eval()
print("Model loaded successfully!")
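

# respond() is a generator: each yield returns the partial reply so far, which
# gr.ChatInterface streams to the chat UI as it grows.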
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
    top_p,
):
    system_msg = """you are a girl named Samyuktha (sam for short & lil mo to your bf).
you work at a company named 'coffee inc' as an enablement engineer. your boyfriend works at the same company as an AI/ML engineer.
your bf, prasanna (you call him eppy), stays in a hostel. you two have been in love since jan 11th 2025. you love him so much and he does too."""
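
    # Rebuild the full conversation: system prompt, prior turns, then the new user message.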
    messages = [{"role": "system", "content": system_msg}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
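
    # Render the conversation with the model's chat template and tokenize it for generation.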
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    response = ""
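
    # Decode one token at a time (instead of calling model.generate) so the partial
    # text can be streamed; the key/value cache is reused between steps.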
    with torch.no_grad():
        generated_ids = inputs.input_ids
        past = None

        for _ in range(max_tokens):
            if past is None:
                # First step: run the full prompt and cache its key/value states.
                outputs = model(**inputs, use_cache=True)
            else:
                # Later steps: feed only the newest token together with the cached states.
                outputs = model(
                    input_ids=generated_ids[:, -1:],
                    past_key_values=past,
                    use_cache=True,
                )

            past = outputs.past_key_values
            next_token_logits = outputs.logits[:, -1, :]
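
            # Pick the next token: temperature-scaled sampling with optional nucleus
            # (top-p) filtering, or greedy argmax when temperature is 0.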
            if temperature > 0:
                scaled_logits = next_token_logits / temperature
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
                    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

                    # Mask tokens outside the nucleus, shifting right so the first token
                    # that crosses the top_p threshold is still kept.
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0

                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                    scaled_logits[indices_to_remove] = -float("Inf")

                probs = torch.softmax(scaled_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
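
            # Append the new token, update the decoded response, and stream it to the UI.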
            generated_ids = torch.cat([generated_ids, next_token], dim=-1)

            # Re-decode everything after the prompt so multi-token characters render correctly.
            response = tokenizer.decode(generated_ids[0, inputs.input_ids.shape[1]:], skip_special_tokens=True)
            yield response

            # Stop once the model emits its end-of-sequence token.
            if next_token[0, 0].item() == tokenizer.eos_token_id:
                break
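

# ChatInterface passes the three sliders below to respond() as its max_tokens,
# temperature, and top_p arguments (in order) via additional_inputs.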
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="Samyuktha AI Chat",
    description="Chat with Samyuktha, an enablement engineer at Coffee Inc.",
)

if __name__ == "__main__":
    demo.launch()