CoreML Conversion of the mxbai-embed-large-v1 sentence embedding model
After extensive testing (and a lot of debugging with ChatGPT), I was able to convert the mxbai-embed-large-v1 model to CoreML and run it mostly on the GPU.
import coremltools as ct
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
# Define a wrapper class for the AutoModel to return only the last_hidden_state
class ModelWrapper(torch.nn.Module):
    """Adapter that exposes only ``last_hidden_state`` of a wrapped model.

    The wrapped model is called with ``input_ids`` and ``attention_mask``
    keyword arguments, and the ``last_hidden_state`` attribute of its result
    is returned, so the wrapper's ``forward`` yields that single value
    instead of the full output object.
    """

    def __init__(self, model):
        """Store *model*, the module whose output will be unwrapped."""
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return its ``last_hidden_state``."""
        result = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Swap in `result.pooler_output` here if the pooled output is needed.
        return result.last_hidden_state
# Load the pretrained checkpoint and its tokenizer, in inference mode.
model_name = "mixedbread-ai/mxbai-embed-large-v1"  # Replace with your model
model = AutoModel.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model so that tracing sees a single tensor output.
wrapped_model = ModelWrapper(model)
wrapped_model.eval()

# Sample input; it is only used to drive the trace.
dummy_input = tokenizer("This is a sample input", return_tensors="pt")

# Trace the model using tensor inputs (input_ids, attention_mask).
# Gradients are not needed for export, so trace with autograd disabled.
with torch.no_grad():
    traced_model = torch.jit.trace(
        wrapped_model,
        (dummy_input["input_ids"], dummy_input["attention_mask"]),
    )

# Convert the traced PyTorch model to CoreML using the ML Program format.
model_from_torch = ct.convert(
    traced_model,
    inputs=[
        # Batch size is fixed at 1; the sequence length may vary from 1 to 512.
        # Both inputs are declared float32, which is what the inference
        # script feeds (it casts the tokenizer output to float32).
        ct.TensorType(
            name="input_ids",
            shape=(1, ct.RangeDim(1, 512)),
            dtype=np.float32,
        ),
        ct.TensorType(
            name="attention_mask",
            shape=(1, ct.RangeDim(1, 512)),
            dtype=np.float32,
        ),
    ],
    # Name the output explicitly: the inference script looks it up as
    # predictions['hidden_states'], so do not rely on an auto-generated name.
    outputs=[ct.TensorType(name="hidden_states")],
    minimum_deployment_target=ct.target.iOS17,
    convert_to="mlprogram",
    compute_precision=ct.precision.FLOAT16,
)

# Save the CoreML model as an mlpackage.
model_from_torch.save("mxbai-embed-large-v1.mlpackage")
It can be run like this:
import coremltools as ct
from transformers import AutoTokenizer
import numpy as np
# Open the converted package and the tokenizer that matches the checkpoint.
mlmodel = ct.models.MLModel("mxbai-embed-large-v1.mlpackage")
tok = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

# Tokenize one sentence into NumPy arrays, truncated to at most 512 tokens.
sentence = "This is a test sentence for the CoreML model"
encoded = tok(
    sentence,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512,
)

# The CoreML inputs are float32, so cast both arrays before predicting.
feed = {
    key: encoded[key].astype(np.float32)
    for key in ("input_ids", "attention_mask")
}
result = mlmodel.predict(feed)

# First batch item, first token position: the CLS token embedding.
cls_vector = result['hidden_states'][0, 0, :]

# Disable NumPy's summarization so the whole vector is printed.
np.set_printoptions(threshold=np.inf)
# Print the CLS token embedding, which is a 1024-dimensional vector
print("CLS Token Embedding:", cls_vector, len(cls_vector))
I verified the output with ollama:
# Send a JSON request to the embeddings endpoint on localhost:11434, asking the
# "mxbai-embed-large" model to embed the same sentence used as input_text in
# the CoreML inference script, so the two results can be compared.
curl http://localhost:11434/api/embeddings -d '{
"model": "mxbai-embed-large",
"prompt": "This is a test sentence for the CoreML model"
}'
Environment: Python 3.11, coremltools 8.0, sentence-transformers 3.1.0, transformers 4.44.2.