import os

import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import AutoModel

# NOTE: the original 'pytorch/vision:v0.9.0' is a torch.hub repository tag,
# not a Hugging Face model id, so AutoModel cannot load it. A ViT checkpoint
# is assumed here instead, since the code below reads features from
# outputs.last_hidden_state.
model_name = 'google/vit-base-patch16-224'

model = AutoModel.from_pretrained(model_name)
model.eval()  # disable dropout for deterministic feature extraction

# Standard ImageNet preprocessing, kept from the original. Note that this
# checkpoint's own image processor normalizes with mean/std 0.5, so these
# features will differ slightly from the checkpoint's canonical pipeline.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


def extract_features(image_path, model, preprocess):
    # Load the image and run it through the torchvision pipeline.
    image = Image.open(image_path).convert('RGB')
    pixel_values = preprocess(image).unsqueeze(0)  # add a batch dimension

    # Vision models take pixel tensors directly; the original passed the
    # image through a text tokenizer, which cannot process image data.
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)

    # Drop the batch dimension and return the token embeddings as NumPy.
    return outputs.last_hidden_state.squeeze(0).numpy()


features_dict = {}
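
# The original never defines images_folder or image_names; the values below
# are hypothetical stand-ins (a simple folder scan) so the loop is runnable.
images_folder = 'images'  # assumed path, adjust as needed
image_names = sorted(os.listdir(images_folder))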

# Extract and cache a feature array for every image in the folder.
for image_name in image_names:
    image_path = os.path.join(images_folder, image_name)
    features = extract_features(image_path, model, preprocess)
    features_dict[image_name] = features
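
# Sanity check (shapes assume the ViT-base checkpoint above): each value in
# features_dict is a (197, 768) array -- one 768-dim embedding per image
# patch plus the [CLS] token.
if features_dict:
    name, feats = next(iter(features_dict.items()))
    print(name, feats.shape)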