Compiled with:

```python
compiler_args = {"num_cores": 16, "auto_cast_type": auto_cast_type}
input_shapes = {"batch_size": 8, "sequence_length": 32}
```

Usage:

```python
import torch
from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
prompt = "What is the capital of France?"

# Tokenize the prompt; fall back to the EOS token for padding if none is set.
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer(prompt, return_tensors="pt")

# Load the pre-compiled Neuron model and generate a completion.
model = NeuronModelForCausalLM.from_pretrained("yahavb/DeepSeek-R1-Distill-Llama-70B-Neuron")
outputs = model.generate(**inputs, max_new_tokens=512, do_sample=True, use_cache=True, temperature=0.7, top_k=50, top_p=0.9)

# Strip the prompt tokens and decode only the newly generated text.
outputs = outputs[0, inputs.input_ids.size(-1):]
response = tokenizer.decode(outputs, skip_special_tokens=True)
print(response)
```
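
For reference, a minimal sketch of how a checkpoint can be compiled with the arguments listed above, using optimum-neuron's export path. The `auto_cast_type` value (`"bf16"`) and the output directory name are assumptions, since the card leaves `auto_cast_type` as an unresolved variable:

```python
from optimum.neuron import NeuronModelForCausalLM

# Compilation settings mirroring the card above; "bf16" is an assumed
# value for auto_cast_type, which the card does not pin down.
compiler_args = {"num_cores": 16, "auto_cast_type": "bf16"}
input_shapes = {"batch_size": 8, "sequence_length": 32}

# export=True traces and compiles the Hugging Face checkpoint for Neuron.
model = NeuronModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    export=True,
    **compiler_args,
    **input_shapes,
)

# Persist the compiled artifacts; the directory name is illustrative.
model.save_pretrained("DeepSeek-R1-Distill-Llama-70B-Neuron")
```

Compilation of a 70B model at `num_cores=16` is a long-running, memory-intensive step and is typically done once on a suitably sized Neuron instance; inference then loads the saved artifacts directly, as in the usage snippet above.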