import torch
import gradio as gr
from unsloth import FastLanguageModel  # FastLanguageModel comes from unsloth, not transformers

# Load the fine-tuned model and its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",  # Replace with your trained model name
    max_seq_length=512,
    dtype=torch.float16,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # Enable Unsloth's faster inference mode

# Define the inference function
def generate_response(user_input):
    # Wrap the raw user input in the labeled prompt format used during training
    labeled_prompt = (
        "Please provide the response with the following labels:\n"
        f"User Input: {user_input}\n"
        "Response:"
    )
    inputs = tokenizer(
        [labeled_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to("cuda")
    response = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(response[0], skip_special_tokens=True)

# Create a Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs="text",
    outputs="text",
    title="Chatbot Interface",
    description="Enter your message below:",
)

# Launch the app
iface.launch()
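# Usage note: running this script starts a local Gradio server, by default at
# http://127.0.0.1:7860; pass share=True to iface.launch() if you also want a
# temporary public URL. The script assumes a CUDA-capable GPU, matching the
# .to("cuda") call above, since unsloth's 4-bit loading targets GPU inference.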