# Notebook for HW3

Please read `README.md` for instructions on how to set up the python environment.

## Import packages you need

In [None]:
# Import necessary packages.
import random
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import ConcatDataset, DataLoader, Dataset
from torchvision.datasets import DatasetFolder
from tqdm import tqdm
from PIL import ImageFile
# Allow PIL to decode image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Seed Python's, NumPy's and PyTorch's random number generators with one value.
_SEED = 1234
random.seed(_SEED)
np.random.seed(_SEED)
torch.manual_seed(_SEED)

## Dataset, Data Loader, and Transforms
Torchvision provides lots of useful utilities for image preprocessing, data wrapping as well as data augmentation.

Here, since our data are stored in folders by class labels, we can directly apply torchvision.datasets.DatasetFolder for wrapping data without much effort.

Please refer to the official PyTorch website for details about the different transforms.

In [2]:
# Root directory of the image data; sub-folder names are appended to it.
folder = "./datasets"
# Number of target classes handed to the classifier.
NUM_CLASSES = 14

In [4]:
# Augmenting transform pipeline applied to PIL images at training time.
train_tfm = transforms.Compose(
    [
        # Bring every image to 256x256, then force three RGB channels.
        transforms.Resize(size=(256, 256)),
        transforms.Lambda(lambda img: img.convert("RGB")),
        # Geometric augmentation: random mirror, then a rotation within +/-20 degrees.
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=20),
        # Random crop covering 80-100% of the image area, resized to 224x224.
        transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
        # Photometric augmentation: perturb brightness, contrast, saturation and hue.
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
        # Additional small rotation, translation (up to 10% per axis) and rescale (0.8x-1.2x).
        transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.8, 1.2)),
        # PIL image -> tensor, followed by per-channel normalization.
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Deterministic transform pipeline used for validation, testing and pseudo-labelling.
test_tfm = transforms.Compose(
    [
        # Bring every image to 256x256, then force three RGB channels.
        transforms.Resize(size=(256, 256)),
        transforms.Lambda(lambda img: img.convert("RGB")),
        # PIL image -> tensor, followed by the same per-channel normalization
        # constants as the training pipeline.
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

def _load_image(path):
    """Open the image stored at ``path`` and read its pixel data immediately.

    ``Image.open`` is lazy and keeps the underlying file open until the pixel
    data is read. Reading inside a ``with`` block closes the file
    deterministically, so datasets that return the image without a transform
    do not hand out objects tied to an open file handle. The image mode is
    left untouched; conversion to RGB is done by the transforms.
    """
    with open(path, "rb") as fh:
        img = Image.open(fh)
        img.load()
    return img


def get_dataset():
    """Build the four image-folder datasets rooted at ``folder``.

    Returns:
        A tuple ``(train_set, valid_set, unlabeled_set, test_set)``.
        ``train_set`` (``/train/labeled``) and ``unlabeled_set``
        (``/train/unlabeled``) are created without a transform and therefore
        yield PIL images. ``valid_set`` (``/val``) and ``test_set``
        (``/test``) apply ``test_tfm``. Only files with the ``jpg``
        extension are picked up.
    """

    def _image_folder(subdir, transform=None):
        # All four datasets differ only in sub-folder and transform.
        return DatasetFolder(
            folder + subdir,
            loader=_load_image,
            extensions="jpg",
            transform=transform,
        )

    train_set = _image_folder("/train/labeled")
    valid_set = _image_folder("/val", transform=test_tfm)
    unlabeled_set = _image_folder("/train/unlabeled")
    test_set = _image_folder("/test", transform=test_tfm)
    return train_set, valid_set, unlabeled_set, test_set

def train_collate_fn(batch):
    """Collate ``(sample, label)`` pairs without stacking the samples.

    The samples are returned as a plain tuple in their original form, and
    only the labels are packed into a tensor.

    Returns:
        ``(samples, labels)`` where ``samples`` is a tuple and ``labels`` is
        a tensor built from the per-sample labels.
    """
    samples, targets = zip(*batch)
    return samples, torch.tensor(targets)

def test_collate_fn(batch):
 data, labels = zip(*batch)
 data = torch.stack(data)
 labels = torch.tensor(labels)
 return data, labels

## Training
* You can finish supervised learning by simply running the provided code without any modification.
* The function `update_dataset` is used for semi-supervised learning: it assigns pseudo-labels to the unlabeled data. Using unlabeled data this way is expected to give better performance. However, you have to implement the function on your own and need to adjust several hyperparameters manually.


In [5]:
from utils import CustomDataset


def update_dataset(
    train_set, unlabeled_set, model, threshold, batch_size=128, num_workers=8
) -> Dataset:
    """Pseudo-label ``unlabeled_set`` with ``model`` and merge the confident samples.

    Every unlabeled image is passed through ``test_tfm`` and the model. A
    sample is kept when its highest softmax probability is strictly greater
    than ``threshold``; the predicted class becomes its label.

    Args:
        train_set: Labeled dataset to extend.
        unlabeled_set: Dataset yielding ``(image, _)`` pairs; the images must
            be accepted by ``test_tfm``.
        model: Classifier producing one row of logits per image. It is
            switched to eval mode and left in eval mode on return.
        threshold: Confidence a prediction has to exceed to be kept.
        batch_size: Batch size used while predicting.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A ``ConcatDataset`` of ``train_set`` and a ``CustomDataset`` holding
        the confident samples, or ``train_set`` itself when no prediction
        exceeds ``threshold``.

    Example:
        new_set = update_dataset(train_set, unlabeled_set, model, threshold=0.8)
    """
    # Send inputs to the device the model actually lives on. Fall back to the
    # CUDA-availability heuristic only for a model without parameters.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make sure the model is in eval mode.
    model.eval()
    softmax = nn.Softmax(dim=-1)

    # shuffle=False keeps the loader order identical to the dataset order.
    unlabeled_loader = DataLoader(
        unlabeled_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=train_collate_fn,
    )

    confident_samples = []
    confident_labels = []

    with torch.no_grad():
        for images, _ in tqdm(unlabeled_loader, desc="Generating pseudo-labels"):
            # The loader yields untransformed images; apply the test transform here.
            inputs = torch.stack([test_tfm(img) for img in images])
            probabilities = softmax(model(inputs.to(device)))

            # Highest probability and the matching class for every sample.
            max_probs, pseudo_labels = torch.max(probabilities, dim=1)

            # One transfer per batch instead of one ``.item()`` call per sample.
            # The untransformed image is already in the batch, so it is reused
            # rather than fetched from the dataset again by index.
            for img, prob, label in zip(
                images, max_probs.tolist(), pseudo_labels.tolist()
            ):
                if prob > threshold:
                    confident_samples.append(img)
                    confident_labels.append(label)

    if not confident_samples:
        print("No confident pseudo-labels found.")
        return train_set

    # Wrap the confident predictions and combine them with the labeled data.
    pseudo_set = CustomDataset(images=confident_samples, labels=confident_labels)
    new_set = ConcatDataset([train_set, pseudo_set])
    print(f"Added {len(confident_samples)} pseudo-labeled samples to training set")
    return new_set


In [6]:
from models import VGG16Classifier, ResNet50Classifier,ResNet101Classifier
from torch import optim

# "cuda" only when GPUs are available.
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build the classifier and move it to the selected device.
# VGG16Classifier and ResNet50Classifier take the same ``num_classes`` argument
# and can be substituted here.
model = ResNet101Classifier(num_classes=NUM_CLASSES)
model = model.to(device)

# Cross-entropy is the training objective for this classification task.
criterion = nn.CrossEntropyLoss()

# AdamW optimizer; learning rate and weight decay are tunable hyperparameters.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4, weight_decay=1e-5)



Model loaded successfully.


In [None]:
# Whether to do semi-supervised learning.
do_semi = True
batch_size = 128
num_workers = 8
train_set, valid_set, unlabeled_set, test_set = get_dataset()

# Training batches are reshuffled every epoch. train_collate_fn leaves the
# images unstacked so that the transform can be applied per batch later.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=train_collate_fn,
)
# Evaluation does not benefit from shuffling; a fixed order keeps the batch
# composition identical from one epoch to the next.
valid_loader = DataLoader(
    dataset=valid_set,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=test_collate_fn,
)
# The test order must stay fixed so that prediction i belongs to test sample i.
test_loader = DataLoader(
    dataset=test_set,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=test_collate_fn,
)

# Lowest validation loss seen so far; any first result improves on infinity.
best_valid_loss = float("inf")
# The epoch counter runs over range(start_epoch, epochs).
start_epoch = 100
epochs = 200
# Minimum softmax confidence a pseudo-label needs in order to be kept.
threshold = 0.8
# Whether the epoch loop may stop before reaching `epochs`.
early_stop = True
for epoch in range(start_epoch, epochs):
    # ---------- Pseudo-labelling ----------
    # After the first quarter of the schedule, on every even epoch, relabel the
    # unlabeled data with the current model and rebuild the training loader
    # from the labeled set plus the confident pseudo-labeled samples.
    if do_semi and epoch > epochs // 4 and epoch % 2 == 0:
        new_set = update_dataset(
            train_set=train_set,
            unlabeled_set=unlabeled_set,
            model=model,
            threshold=threshold,
            batch_size=batch_size,
            num_workers=num_workers,
        )
        train_loader = DataLoader(
            dataset=new_set,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            pin_memory=True,
            collate_fn=train_collate_fn,
        )

    # ---------- Training ----------
    # Make sure the model is in train mode before training.
    model.train()

    # Per-batch loss and accuracy, stored as Python floats so that no device
    # tensors are kept alive across the epoch.
    train_loss = []
    train_accs = []

    for imgs, labels in tqdm(train_loader):
        # The loader yields untransformed images; augment them and stack into a batch.
        new_images = torch.stack([train_tfm(img) for img in imgs])

        # Move the labels once; they are needed for both loss and accuracy.
        labels = labels.to(device)

        # Forward the data. (Data and model must be on the same device.)
        logits = model(new_images.to(device))

        # CrossEntropyLoss applies log-softmax itself, so raw logits are passed in.
        loss = criterion(logits, labels)

        # Clear gradients left over from the previous step, then backpropagate and update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy of the current batch.
        acc = (logits.argmax(dim=-1) == labels).float().mean()

        train_loss.append(loss.item())
        train_accs.append(acc.item())

    # Epoch-level figures are the plain mean of the per-batch values.
    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    print(
        f"[ Train | {epoch + 1:03d}/{epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}"
    )

    # ---------- Validation ----------
    # Eval mode disables training-only behaviour such as dropout.
    model.eval()

    valid_loss = []
    valid_accs = []

    for imgs, labels in tqdm(valid_loader):
        labels = labels.to(device)

        # No gradients are needed for the forward pass, the loss or the accuracy.
        with torch.no_grad():
            logits = model(imgs.to(device))
            loss = criterion(logits, labels)
            acc = (logits.argmax(dim=-1) == labels).float().mean()

        valid_loss.append(loss.item())
        valid_accs.append(acc.item())

    # Epoch-level figures are the plain mean of the per-batch values.
    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)

    print(
        f"[ Valid | {epoch + 1:03d}/{epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}"
    )

    # ---------- Checkpointing / early stopping ----------
    if valid_loss < best_valid_loss:
        # New best validation loss: remember it and overwrite the checkpoint.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Model saved with loss: {valid_loss:.5f}, acc: {valid_acc:.5f}")
    elif early_stop:
        # Past the halfway epoch, stop as soon as the validation loss is more
        # than 20% above the best value seen so far.
        if epoch > epochs // 2 and valid_loss > best_valid_loss * 1.2:
            print("Early stopping")
            break

Generating pseudo-labels: 0%| | 0/22 [00:00, ?it/s]

Generating pseudo-labels: 100%|██████████| 22/22 [00:08<00:00, 2.52it/s]


Added 2148 pseudo-labeled samples to training set


100%|██████████| 71/71 [01:20<00:00, 1.13s/it]


[ Train | 101/200 ] loss = 0.29985, acc = 0.89466


100%|██████████| 1/1 [00:00<00:00, 1.40it/s]


[ Valid | 101/200 ] loss = 0.70127, acc = 0.80612
Model saved with loss: 0.70127, acc: 0.80612


100%|██████████| 71/71 [01:19<00:00, 1.12s/it]


[ Train | 102/200 ] loss = 0.61288, acc = 0.80230


100%|██████████| 1/1 [00:00<00:00, 1.31it/s]


[ Valid | 102/200 ] loss = 0.45687, acc = 0.83673
Model saved with loss: 0.45687, acc: 0.83673


Generating pseudo-labels: 100%|██████████| 22/22 [00:08<00:00, 2.57it/s]


Added 2116 pseudo-labeled samples to training set


100%|██████████| 70/70 [01:20<00:00, 1.15s/it]


[ Train | 103/200 ] loss = 0.32189, acc = 0.89123


100%|██████████| 1/1 [00:00<00:00, 1.28it/s]


[ Valid | 103/200 ] loss = 0.53417, acc = 0.87755


100%|██████████| 70/70 [01:19<00:00, 1.13s/it]


[ Train | 104/200 ] loss = 0.50316, acc = 0.83342


100%|██████████| 1/1 [00:00<00:00, 1.17it/s]

[ Valid | 104/200 ] loss = 0.64077, acc = 0.82653
Early stopping





In [None]:
# torch.save(model.state_dict(), "best_model.pth")

: 

: 

In [8]:
# Load the best checkpoint back into the model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()

ResNet101Classifier(
 (resnet): ResNet(
 (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
 (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 (relu): ReLU(inplace=True)
 (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
 (layer1): Sequential(
 (0): Bottleneck(
 (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
 (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
 (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
 (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
 (relu): ReLU(inplace=True)
 (downsample): Sequential(
 (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
 (1): B

## Testing
For inference, we need to make sure the model is in eval mode, and the order of the dataset should not be shuffled ("shuffle=False" in test_loader).

In [9]:
# Put the model in eval mode: modules such as Dropout and BatchNorm behave
# differently while the model is in training mode.
model.eval()

# Predicted class index for every test sample, in loader order.
predictions = []

# Gradients are not needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    # Each batch is an (images, labels) pair. The labels are ignored here
    # because no ground truth is used at test time.
    for imgs, _ in tqdm(test_loader):
        logits = model(imgs.to(device))

        # The class with the greatest logit is the prediction.
        predictions.extend(logits.argmax(dim=-1).tolist())

100%|██████████| 33/33 [00:10<00:00, 3.27it/s]


In [None]:
# Write the predictions to predict.csv.
with open("predict.csv", "w") as f:
    # Header row: "Id,Category".
    f.write("Id,Category\n")

    # One row per prediction: its position in `predictions`, then the predicted class.
    f.writelines(f"{i},{pred}\n" for i, pred in enumerate(predictions))