# Source: clip-variants / convert.py (commit b98d24d by mlunar, 3.28 kB)
# Commit message: "Add shape inference for textual models"
from cgitb import text
import os
import clip
import torch.onnx
import torch
from torch import nn
from multiprocessing import Pool
class TextTransformer(nn.Module):
    """Wrap a CLIP model so that calling the module runs its text encoder.

    ``forward`` delegates to ``clip_model.encode_text``, which lets the text
    branch be handed to code that expects a plain module taking one tensor.
    """

    def __init__(self, clip_model):
        """Store the wrapped model; it must provide ``encode_text``."""
        super().__init__()
        self.clip_model = clip_model

    def forward(self, x: torch.Tensor):
        """Return ``clip_model.encode_text(x)`` unchanged."""
        text_features = self.clip_model.encode_text(x)
        return text_features
def export(model, input, path):
    """Export ``model`` to ONNX at ``path`` using ``input`` as example input.

    Args:
        model: The model being exported.
        input: Example model input (or a tuple for multiple inputs).
        path: Where to save the model; a file path or a file-like object.

    The exported graph has one input named ``"input"`` and one output named
    ``"output"``; axis 0 of both is declared dynamic as ``"batch_size"``.
    When ``path`` is a filesystem path, its parent directory is created first
    so that exporting into a not-yet-existing output folder does not fail.
    """
    print(f"Exporting to {path}")

    # Only filesystem paths have a parent directory to prepare; file-like
    # objects are passed through to the exporter untouched.
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    torch.onnx.export(
        model,
        input,
        path,
        export_params=True,  # store the trained weights inside the model file
        opset_version=16,  # ONNX opset to target
        do_constant_folding=True,  # fold constants as an optimization
        input_names=["input"],
        output_names=["output"],
        dynamic_axes={
            "input": {0: "batch_size"},  # variable-length axes
            "output": {0: "batch_size"},
        },
    )
def convert(model_name, dashed_name):
    """Export the visual and textual parts of one CLIP model to ONNX.

    Args:
        model_name: Model name passed to ``clip.load`` (e.g. ``"ViT-B/32"``).
        dashed_name: Label used in the output file names.

    Output goes to ``{output_dir}/clip-{dashed_name}-visual.onnx`` and
    ``{output_dir}/clip-{dashed_name}-textual.onnx``. A file that already
    exists is skipped; when both exist the model is not loaded at all.
    """
    visual_path = f"{output_dir}/clip-{dashed_name}-visual.onnx"
    textual_path = f"{output_dir}/clip-{dashed_name}-textual.onnx"

    visual_exists = os.path.exists(visual_path)
    textual_exists = os.path.exists(textual_path)

    # Nothing to do: avoid the (expensive) model load entirely.
    if visual_exists and textual_exists:
        print(f"{visual_path} exists, skipping")
        print(f"{textual_path} exists, skipping")
        return

    print(f"Model: {model_name}")
    print("Loading CLIP")
    model, _ = clip.load(model_name, device=device)
    model = model.to(device=device)

    if not visual_exists:
        input_res = model.visual.input_resolution
        # Build the dummy image on the same device and with the same dtype as
        # the visual tower's weights, mirroring how the text input below is
        # moved to ``device``. Otherwise the example input only matches the
        # model when everything happens to live on the CPU in float32.
        reference = next(model.visual.parameters())
        export(
            model.visual,
            torch.rand(
                1,
                3,
                input_res,
                input_res,
                device=reference.device,
                dtype=reference.dtype,
            ),
            visual_path,
        )
    else:
        print(f"{visual_path} exists, skipping")

    if not textual_exists:
        text_transformer = TextTransformer(model)
        export(
            text_transformer,
            clip.tokenize(["hello onnx"]).to(device),
            textual_path,
        )
    else:
        print(f"{textual_path} exists, skipping")
# Device used to load CLIP and to place the example inputs. It is pinned to
# the CPU: the former CUDA auto-detection was overwritten on the very next
# line and never took effect, so only the effective value is kept here.
device = "cpu"
# Directory that receives the exported .onnx files.
output_dir = "converted"
if __name__ == "__main__":
    print(f"Torch device: {device}")

    available_models = clip.available_models()
    print(f"Available models: {available_models}")

    # (CLIP model name, label used in the output file names). Each model is
    # listed once; repeated entries would only re-run the "exists, skipping"
    # path in convert().
    models = [
        ("RN50", "resnet-50"),
        ("RN101", "resnet-101"),
        ("RN50x4", "resnet-50x4"),
        ("RN50x16", "resnet-50x16"),
        ("RN50x64", "resnet-50x64"),
        ("ViT-B/16", "vit-base-patch16"),
        ("ViT-B/32", "vit-base-patch32"),
        ("ViT-L/14", "vit-large-patch14"),
        ("ViT-L/14@336px", "vit-large-patch14-336"),
    ]
    print(f"Converting models: {models}")

    # Models are converted one after another. NOTE: the original author left
    # a hint that multiprocessing's Pool.starmap(convert, models) can be used
    # instead to dispatch the conversions to worker processes.
    for model_name, dashed_name in models:
        convert(model_name, dashed_name)

    print("done")