|
# CoreML Conversion of the mxbai-embed-large-v1 Sentence Embedding Model
|
|
|
After extensive testing (and a lot of debugging with ChatGPT), I was able to convert the mxbai-embed-large-v1 model to CoreML and run it mostly on the GPU. |
|
|
|
```python
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import coremltools as ct


# Wrapper that returns only the last_hidden_state tensor, because
# torch.jit.trace requires tensor (not dict) outputs
class ModelWrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return output.last_hidden_state  # or use 'pooler_output' if needed


# Load the model and tokenizer
model_name = "mixedbread-ai/mxbai-embed-large-v1"  # replace with your model
model = AutoModel.from_pretrained(model_name)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model so it returns only the tensor output
wrapped_model = ModelWrapper(model)
wrapped_model.eval()

# Sample input used to trace the model
dummy_input = tokenizer("This is a sample input", return_tensors="pt")

# Trace the model with tensor inputs (input_ids, attention_mask)
traced_model = torch.jit.trace(
    wrapped_model, (dummy_input["input_ids"], dummy_input["attention_mask"])
)

# Convert the traced PyTorch model to CoreML using the ML Program format,
# accepting sequence lengths from 1 to 512 tokens
model_from_torch = ct.convert(
    traced_model,
    inputs=[
        ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 512)), dtype=np.float32),
        ct.TensorType(name="attention_mask", shape=(1, ct.RangeDim(1, 512)), dtype=np.float32),
    ],
    minimum_deployment_target=ct.target.iOS17,
    convert_to="mlprogram",
    compute_precision=ct.precision.FLOAT16,
)

# Save the CoreML model as an .mlpackage
model_from_torch.save("mxbai-embed-large-v1.mlpackage")
```
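
If you want to control where the model executes, coremltools exposes a `compute_units` option when loading the package. A minimal sketch (this is standard coremltools API, not anything specific to this model):

```python
import coremltools as ct

# Restrict execution to CPU and GPU; ct.ComputeUnit.ALL would also
# allow the Neural Engine, and ct.ComputeUnit.CPU_ONLY is handy when
# debugging numerical differences
model = ct.models.MLModel(
    "mxbai-embed-large-v1.mlpackage",
    compute_units=ct.ComputeUnit.CPU_AND_GPU,
)
```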
|
|
|
|
|
The converted model can be run like this:
|
```python
import coremltools as ct
from transformers import AutoTokenizer
import numpy as np

# Load the CoreML model
model = ct.models.MLModel("mxbai-embed-large-v1.mlpackage")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

# Prepare some input text
input_text = "This is a test sentence for the CoreML model"
inputs = tokenizer(input_text, return_tensors="np", padding=True, truncation=True, max_length=512)

# Extract input tensors; the converted model expects float32,
# as declared during conversion
input_ids = inputs["input_ids"].astype(np.float32)
attention_mask = inputs["attention_mask"].astype(np.float32)

# Run the CoreML model
coreml_input = {"input_ids": input_ids, "attention_mask": attention_mask}
predictions = model.predict(coreml_input)

# The output named 'hidden_states' holds the last_hidden_state tensor
hidden_states = predictions["hidden_states"]
cls_embedding = hidden_states[0, 0, :]
np.set_printoptions(threshold=np.inf)

# Print the CLS token embedding, a 1024-dimensional vector
print("CLS Token Embedding:", cls_embedding, len(cls_embedding))
```
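
To sanity-check the conversion itself, you can compare the CoreML CLS embedding against the original PyTorch model. A minimal sketch, reusing `cls_embedding` from the script above (with FLOAT16 compute precision the vectors will only agree approximately, so cosine similarity is a more useful check than exact equality):

```python
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "mixedbread-ai/mxbai-embed-large-v1"
pt_model = AutoModel.from_pretrained(model_name).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# CLS embedding from the original PyTorch model
inputs = tokenizer("This is a test sentence for the CoreML model", return_tensors="pt")
with torch.no_grad():
    pt_cls = pt_model(**inputs).last_hidden_state[0, 0, :].numpy()

# Cosine similarity against the CoreML embedding; values near 1.0
# indicate the conversion preserved the model's behavior
cos = np.dot(pt_cls, cls_embedding) / (np.linalg.norm(pt_cls) * np.linalg.norm(cls_embedding))
print("Cosine similarity (PyTorch vs CoreML):", cos)
```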
|
|
|
I verified the output against Ollama:
|
|
|
```bash
curl http://localhost:11434/api/embeddings -d '{
  "model": "mxbai-embed-large",
  "prompt": "This is a test sentence for the CoreML model"
}'
```
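
To make that comparison programmatic, the `embedding` field of the Ollama response can be checked against the CoreML CLS embedding with cosine similarity. A minimal sketch, assuming a local Ollama instance with `mxbai-embed-large` pulled, and reusing `cls_embedding` from the run script above (Ollama may apply its own pooling or normalization, so compare directions rather than raw values):

```python
import json
import urllib.request

import numpy as np

# Fetch the embedding from a local Ollama instance
# (assumes `ollama pull mxbai-embed-large` has been run)
req = urllib.request.Request(
    "http://localhost:11434/api/embeddings",
    data=json.dumps({
        "model": "mxbai-embed-large",
        "prompt": "This is a test sentence for the CoreML model",
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    ollama_embedding = np.array(json.loads(resp.read())["embedding"])

# Cosine similarity against the CoreML CLS embedding;
# values close to 1.0 mean the two models agree
cos = np.dot(ollama_embedding, cls_embedding) / (
    np.linalg.norm(ollama_embedding) * np.linalg.norm(cls_embedding)
)
print("Cosine similarity (Ollama vs CoreML):", cos)
```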
|
|
|
Environment:

- Python 3.11
- coremltools 8.0
- sentence-transformers 3.1.0
- transformers 4.44.2
|
|