In [None]:
# !pip install torch
# !pip install intel-extension-for-pytorch
# !pip install transformers
# !pip install datasets
# !pip install onnxruntime
# !pip install neural_compressor

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader
import intel_extension_for_pytorch as ipex
import json

# Load the pre-trained FLAN-T5 checkpoint and its tokenizer.
model_name = "google/flan-t5-large"  # FLAN-T5 Large checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Apply Intel Extension for PyTorch optimizations (FP32).
# NOTE(review): no optimizer is passed here although the result is later handed
# to a Trainer for fine-tuning - confirm this is the intended IPEX usage.
optimized_model = ipex.optimize(model, dtype=torch.float32)

# Load the fine-tuning examples.
# json.load() needs an open file object; passing the path string raises
# AttributeError because str has no .read().
with open("t5train.json", "r", encoding="utf-8") as train_file:
    data = json.load(train_file)

# Convert the data to a Hugging Face dataset.
# NOTE(review): presumably the JSON is a column-oriented mapping with
# 'input_text' and 'output_text' lists (those are the keys the tokenization
# step reads) - verify against the actual file.
dataset = Dataset.from_dict(data)

# Plain PyTorch loader over the raw (untokenized) dataset; it is not referenced
# by any of the later cells shown in this notebook.
dataloader = DataLoader(dataset, num_workers=4, pin_memory=True)
# Tokenize the data
def preprocess_function(examples, max_length=2048):
    """Tokenize input/output text and attach loss-ready labels.

    Args:
        examples: Mapping with 'input_text' and 'output_text' entries. When
            called through ``Dataset.map(..., batched=True)`` each entry is a
            list of strings; a single string per entry is also accepted.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        The tokenizer encoding of 'input_text' with an added 'labels' entry
        holding the token ids of 'output_text', where padding positions are
        replaced by -100.
    """
    model_inputs = tokenizer(
        examples['input_text'], padding="max_length", truncation=True, max_length=max_length
    )
    labels = tokenizer(
        examples['output_text'], padding="max_length", truncation=True, max_length=max_length
    )

    # Targets are padded up to max_length. Without masking, every pad position
    # would be scored by the loss; -100 is the index the loss ignores.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        return [-100 if token == pad_id else token for token in sequence]

    label_ids = labels['input_ids']
    if len(label_ids) > 0 and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs['labels'] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single example: a flat id sequence.
        model_inputs['labels'] = _mask_padding(label_ids)
    return model_inputs

In [None]:
# Tokenize every example in batches via preprocess_function.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Hyperparameters for a short, CPU-only fine-tuning run.
training_kwargs = dict(
    output_dir="./flan_t5_results",   # where checkpoints are written
    eval_strategy="epoch",            # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    save_steps=10,                    # checkpoint every 10 steps
    save_total_limit=1,               # keep only the newest checkpoint
    fp16=False,                       # no mixed precision
    use_cpu=True,                     # force CPU-only training
)
training_args = TrainingArguments(**training_kwargs)

# The same tokenized dataset serves as both the training and evaluation set.
trainer = Trainer(
    model=optimized_model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Fine-tune the model.
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side.
finetuned_dir = "./flan_t5_finetuned"
for artifact in (optimized_model, tokenizer):
    artifact.save_pretrained(finetuned_dir)

# Switch the model to evaluation mode (no evaluation is run here).
optimized_model.eval()

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from neural_compressor.quantization import fit
from neural_compressor.config import PostTrainingQuantConfig

# Load the fine-tuned FP32 model and its tokenizer from disk.
model_path = "./flan_t5_finetuned"
optimized_model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Post-training dynamic quantization: no calibration data is supplied.
quant_config = PostTrainingQuantConfig(approach='dynamic')

# Quantize the model.
q_model = fit(model=optimized_model, conf=quant_config)
if q_model is None:
    # Fail loudly here instead of with an opaque AttributeError on the save below.
    raise RuntimeError("Neural Compressor did not return a quantized model")

# Save the quantized model.
# fit() returns a Neural Compressor model wrapper rather than a Hugging Face
# PreTrainedModel, so it is persisted with save(), not save_pretrained().
# NOTE(review): the directory name ends in "fp16" although this step performs
# dynamic quantization - consider renaming once downstream users are checked.
quantized_model_path = "./flan_t5_quantized_fp16"
q_model.save(quantized_model_path)
tokenizer.save_pretrained(quantized_model_path)

print(f"Quantized model saved at: {quantized_model_path}")

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import os

# Load the model and tokenizer that will be exported.
# NOTE(review): no cell in this notebook writes "./flan_t5_fp16" (fine-tuning
# saves to "./flan_t5_finetuned", quantization to "./flan_t5_quantized_fp16");
# confirm this directory is produced elsewhere before running.
model_path = "./flan_t5_fp16"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Example input text used to trace the model
input_text = "Translate English to French: How are you?"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Prepare the decoder input: decoding starts from the model's configured
# decoder start token; fall back to the pad token if the config leaves it unset.
decoder_start_token_id = model.config.decoder_start_token_id
if decoder_start_token_id is None:
    decoder_start_token_id = tokenizer.pad_token_id
decoder_input_ids = torch.tensor([[decoder_start_token_id]])

# Create output directory if it doesn't exist
onnx_output_dir = "./flant5"
os.makedirs(onnx_output_dir, exist_ok=True)

# Define the path for the ONNX model
onnx_model_path = os.path.join(onnx_output_dir, "flan_t5_fp16.onnx")

# Export the model to ONNX.
# Encoder and decoder lengths vary independently, so they get distinct symbolic
# axis names instead of sharing a single "sequence_length".
torch.onnx.export(
    model,                                   # Model to be converted
    (inputs["input_ids"], inputs["attention_mask"], decoder_input_ids),  # Positional inputs
    onnx_model_path,                         # Path to save the ONNX model
    export_params=True,                      # Store the trained parameters
    opset_version=13,                        # ONNX opset version
    do_constant_folding=True,                # Fold constants at export time
    input_names=["input_ids", "attention_mask", "decoder_input_ids"],
    output_names=["output"],                 # Names the first graph output
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "encoder_sequence_length"},
        "attention_mask": {0: "batch_size", 1: "encoder_sequence_length"},
        "decoder_input_ids": {0: "batch_size", 1: "decoder_sequence_length"},
        "output": {0: "batch_size", 1: "decoder_sequence_length"},
    },
)

print(f"ONNX model saved at: {onnx_model_path}")

In [None]:
import onnxruntime as ort
import numpy as np
from transformers import T5Tokenizer

# Load the ONNX model and tokenizer.
# The export cell writes the graph to ./flant5/flan_t5_fp16.onnx, so it is read
# from that location.
onnx_model_path = "./flant5/flan_t5_fp16.onnx"
# NOTE(review): no cell in this notebook writes "./flan_t5_fp16"; confirm the
# tokenizer directory exists before running.
tokenizer = T5Tokenizer.from_pretrained("./flan_t5_fp16")
ort_session = ort.InferenceSession(onnx_model_path)

# Input text for the model
input_text = "Translate English to French: How are you?"
inputs = tokenizer(input_text, return_tensors="np", padding=True, truncation=True)

# Ensure inputs are int64 numpy arrays
input_ids = np.array(inputs["input_ids"], dtype=np.int64)
attention_mask = np.array(inputs["attention_mask"], dtype=np.int64)

# The decoder is seeded with the pad token and extended one token per step.
decoder_start_token_id = tokenizer.pad_token_id
decoder_input_ids = np.array([[decoder_start_token_id]], dtype=np.int64)

# Greedy decoding: one forward pass only scores the next token, so the graph is
# re-run with the growing decoder prefix until EOS or the token budget is hit.
MAX_NEW_TOKENS = 64
eos_token_id = tokenizer.eos_token_id

for _ in range(MAX_NEW_TOKENS):
    # ONNX model inputs
    onnx_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "decoder_input_ids": decoder_input_ids,
    }

    # Run the ONNX model
    onnx_outputs = ort_session.run(None, onnx_inputs)

    # First output is used as logits: [batch_size, decoder_length, vocab_size]
    logits = onnx_outputs[0]
    next_token_id = int(np.argmax(logits[0, -1, :]))

    decoder_input_ids = np.concatenate(
        [decoder_input_ids, np.array([[next_token_id]], dtype=np.int64)],
        axis=1,
    )
    if next_token_id == eos_token_id:
        break

# Drop the start token; what remains are the generated token IDs.
token_ids = decoder_input_ids[:, 1:]

# Decode the token IDs into text
decoded_output = tokenizer.decode(token_ids[0], skip_special_tokens=True)

print(f"ONNX Model Output: {decoded_output}")
