import os

import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import AutoModel, AutoTokenizer

# Load a pre-trained ResNet-50 from the Hugging Face Hub.
# 'pytorch/vision:v0.9.0' is a torch.hub repository spec, not a Hub model id,
# so AutoModel.from_pretrained() cannot resolve it; 'microsoft/resnet-50' is
# a ResNet-50 checkpoint that is published on the Hub.
model_name = 'microsoft/resnet-50'
model = AutoModel.from_pretrained(model_name)

# Set the model to evaluation mode (inference behaviour for batch-norm etc.)
model.eval()

# Image checkpoints ship no tokenizer: tokenizers encode text, and
# AutoTokenizer.from_pretrained() fails for a vision model. The name is kept
# (as None) only so existing calls that pass `tokenizer` to
# extract_features() keep working.
tokenizer = None

# Define the preprocessing pipeline: resize the short side to 256, take the
# central 224x224 crop, convert to a float tensor and normalise each channel
# with the commonly used ImageNet mean/std.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


# Define a function to extract features from an image
def extract_features(image_path, model, tokenizer, preprocess):
    """Return the model's last hidden state for one image as a NumPy array.

    Args:
        image_path: Path of the image file to read.
        model: Vision model called as ``model(pixel_values=...)`` whose
            output exposes ``last_hidden_state``.
        tokenizer: Unused. Accepted only for backward compatibility with
            existing callers; images are not tokenized.
        preprocess: Callable mapping an RGB ``PIL.Image`` to a
            ``(channels, height, width)`` tensor.

    Returns:
        ``outputs.last_hidden_state`` with all size-1 dimensions (including
        the batch dimension) squeezed out, as a ``numpy.ndarray``.
        NOTE(review): for ResNet-50 with a 224x224 input this is expected to
        be the (2048, 7, 7) final feature map -- confirm; the pooled vector,
        if wanted instead, is exposed separately as ``pooler_output``.
    """
    # The context manager guarantees the file handle is released; convert()
    # returns a fully loaded copy, so it stays valid after the file closes.
    with Image.open(image_path) as img:
        image = img.convert('RGB')

    # Preprocess and add the batch dimension the model expects.
    pixel_values = preprocess(image).unsqueeze(0)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(pixel_values=pixel_values)

    # .cpu() makes the NumPy conversion work even if the output lives on GPU.
    return outputs.last_hidden_state.squeeze().detach().cpu().numpy()

# Directory holding the input images.
# NOTE(review): the original script used `images_folder` and `image_names`
# without ever defining them (NameError at runtime). 'images' is a
# placeholder -- point it at the real dataset directory.
images_folder = 'images'

# File extensions treated as images when scanning `images_folder`.
# NOTE(review): adjust to the formats actually present in the dataset.
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp')

# os.listdir() returns entries in arbitrary order; sort so the processing
# order (and the dictionary's key order) is the same on every run.
image_names = sorted(
    name for name in os.listdir(images_folder)
    if name.lower().endswith(image_extensions)
)

# Define a dictionary to store the features, keyed by image file name
features_dict = {}

# Loop over the images and extract the features
for image_name in image_names:
    # Extract the features for this image
    image_path = os.path.join(images_folder, image_name)
    features = extract_features(image_path, model, tokenizer, preprocess)
    # Add the features to the dictionary
    features_dict[image_name] = features