import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the local language model and its tokenizer
model_path = "export/model"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move the model to the GPU if one is available, otherwise run on CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Generate a reply for a single user prompt
def generate_response(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Render the chat messages into the model's expected prompt format
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512,
    )
    # Strip the prompt tokens so only the newly generated tokens are decoded
    generated_ids = generated_ids[:, model_inputs.input_ids.shape[1]:]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

# Main loop for chatting
while True:
    prompt = input("You: ")
    if prompt.lower() == "exit":
        print("Goodbye!")
        break
    response = generate_response(prompt)
    print("Bot:", response)
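
# Optional extension (a sketch, not part of the script above): the loop above sends each
# prompt in isolation, so the model never sees earlier turns. One possible variant keeps a
# running `history` list (a name introduced here for illustration) and replays it through
# apply_chat_template on every turn, reusing the same `model`, `tokenizer`, and `device`.
def chat_with_history():
    history = [{"role": "system", "content": "You are a helpful assistant."}]
    while True:
        prompt = input("You: ")
        if prompt.lower() == "exit":
            print("Goodbye!")
            break
        history.append({"role": "user", "content": prompt})
        text = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text], return_tensors="pt").to(device)
        generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
        # Decode only the newly generated tokens, then remember the reply for later turns
        new_tokens = generated_ids[:, model_inputs.input_ids.shape[1]:]
        response = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
        history.append({"role": "assistant", "content": response})
        print("Bot:", response)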