File size: 2,193 Bytes
be7ebcc
3138612
be7ebcc
 
3138612
 
 
 
 
 
 
 
 
 
 
be7ebcc
3138612
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be7ebcc
 
3138612
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import pandas as pd
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset

class ImageCaptionDataset(Dataset):
    """
    PyTorch Dataset for image-caption pairs listed in a CSV file.

    Each CSV row holds an image filename (column 0) and its caption
    (column 1); images are loaded from a base directory and optionally
    transformed.

    Attributes:
        df (pd.DataFrame): Parsed CSV of image filenames and captions.
        image_path (str): Base directory the image filenames are joined to.
        transform (callable): Transformation applied to each loaded image.
    """

    def __init__(self, caption_file: str, file_path: str, transform=None):
        """
        Initialize the dataset from a caption CSV file.

        Args:
            caption_file (str): Path to the CSV file where each row has an
                image filename and a caption.
            file_path (str): Base directory containing the image files.
            transform (callable, optional): Transform applied to each image.
                Defaults to resizing to 224x224 (ViT input size) and
                converting to a tensor.
        """
        self.df = pd.read_csv(caption_file)
        self.image_path = file_path
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),   # Resize to 224x224 for ViT
            transforms.ToTensor(),           # Convert to tensor
            # Normalize to have values in the range [0, 1]
            # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """
        Return the total number of image-caption pairs in the dataset.
        """
        return len(self.df)

    def __getitem__(self, idx):
        """
        Retrieve an image and its corresponding caption by index.

        Args:
            idx (int): Index of the data item.

        Returns:
            tuple: (image, caption) where image is the transformed image
            tensor and caption is the associated text.
        """
        img_path = self.df.iloc[idx, 0]  # The first column contains image paths
        caption = self.df.iloc[idx, 1]   # The second column contains captions
        # Load image; os.path.join handles a missing trailing separator on
        # image_path, which plain string concatenation silently got wrong.
        image = Image.open(os.path.join(self.image_path, img_path)).convert('RGB')

        # Apply transformations to the image (always truthy here, since
        # __init__ installs a default pipeline when none is supplied).
        if self.transform:
            image = self.transform(image)

        return image, caption