import os

import gradio as gr
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_model(model_path):
    config = PeftConfig.from_pretrained(model_path)
    base_model = AutoModelForCausalLM.from_pretrained(
        config.base_model_name_or_path,
        trust_remote_code=True,
        token=os.environ["HF_TOKEN"],
    )
    base_model.config.use_cache = False

    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token

    # Load the LoRA adapter on top of the base model
    model = PeftModel.from_pretrained(
        base_model, model_path, low_cpu_mem_usage=True
    )
    return model, tokenizer


def generate_text(prompt):
    # Role tags assumed from the fine-tuning prompt template; adjust them
    # to match the template actually used during training.
    prompt = "<human>: " + prompt + " <bot>:"
    batch = tokenizer(
        prompt, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    with torch.amp.autocast(device):
        output_tokens = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # Keep only the assistant's reply, dropping the echoed prompt.
    return generated_text.split("<bot>: ")[1].split("<human>: ")[-1]


device = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = load_model(os.path.join(os.getcwd(), "weights"))
model.to(device)
model.eval()

iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here...", lines=5),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="LLaMA-3.2-3B-Instruct-QLoRA",
    description="LLaMA-3.2-3B-Instruct fine-tuned using QLoRA on OpenAssistant/oasst1",
    examples=[
        ["Can you describe winter?"],
        ["How about we play a fun game?"],
    ],
)

if __name__ == "__main__":
    iface.launch()