from transformers import AutoModelForCausalLM, AutoTokenizer

model = None
tokenizer = None


def setup():
    global model
    global tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        "/data/Llama-3.2-1B-Instruct",  # Updated path for Inference Endpoints
        load_in_8bit=True,  # Use 8-bit quantization for efficiency, adjust if needed
        device_map="auto",  # Let HF determine optimal device placement
    )
    tokenizer = AutoTokenizer.from_pretrained("/data/Llama-3.2-1B-Instruct")  # Updated path


def generate(inputs):
    global model
    global tokenizer
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output
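
# A minimal usage sketch, not part of the handler itself: it shows how setup()
# and generate() above fit together. It assumes bitsandbytes is installed for
# the 8-bit load and that the model files exist at /data/Llama-3.2-1B-Instruct;
# the prompt string is purely illustrative.
if __name__ == "__main__":
    setup()  # load the model and tokenizer once at startup
    print(generate("Hello, how are you?"))  # generate with default settings and print the decoded text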