Upload 9 files
Browse files- .gitattributes +2 -0
- best_model.txt +1 -0
- le_net_learning_mnist.py +266 -0
- lenet_mnist_model.pth +3 -0
- let_net_arch.png +0 -0
- mnist_dataset/t10k-images.idx3-ubyte +3 -0
- mnist_dataset/t10k-labels.idx1-ubyte +0 -0
- mnist_dataset/train-images.idx3-ubyte +3 -0
- mnist_dataset/train-labels.idx1-ubyte +0 -0
- utils.py +62 -0
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
36 |
mnist_dataset/t10k-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
37 |
mnist_dataset/train-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1 @@
1 |
@@ -0,0 +1,266 @@
1 |
# Rewriting the LeNet model to learn the MNIST dataset and save the model parameters,
2 |
# This is considered something we should do in Week 3 of the Deep Learning and Computer Vision course.
3 |
4 |
# We will implement LeNet-5 architecture to learn the MNIST dataset.
5 |
6 |
from torchvision.transforms import ToTensor
7 |
# from torchvision.transforms import v2
8 |
from torchvision import transforms
9 |
from torch.utils.data import DataLoader
10 |
from torch.utils.data import Dataset
11 |
from torchvision import datasets
12 |
import matplotlib.pyplot as plt
13 |
from PIL import Image
14 |
from time import time
15 |
from torch import nn
16 |
import pandas as pd
17 |
import numpy as np
18 |
import torch, os
19 |
from utils import ApplyEnhancementFilter
20 |
21 |
# Load device first (GPU or CPU)
22 |
device = (
23 |
24 |
if torch.cuda.is_available()
25 |
else "mps"
26 |
if torch.backends.mps.is_available()
27 |
else "cpu"
28 |
29 |
print(f"Using {device} device for training/inference.")
30 |
if device == "cuda":
31 |
print(f"GPU being used: {torch.cuda.get_device_name(0)}")
32 |
33 |
34 |
train_transform = transforms.Compose([
35 |
# Data augmentation transformations
36 |
# ApplyEnhancementFilter(out_channels=1, kernel_size=3, stride=1, padding=1),
37 |
transforms.RandomAffine(degrees=35, translate=(0.1, 0.1), scale=(0.9, 1.1)),
38 |
39 |
# Convert images to tensors and normalize
40 |
41 |
transforms.Normalize((0.13066047430038452,), (0.30810782313346863,)),
42 |
# Pad the image to make it 32x32
43 |
transforms.Pad(2, fill=0, padding_mode='constant'),
44 |
45 |
46 |
# For the test dataset, you should not apply these augmentations
47 |
test_transform = transforms.Compose([
48 |
49 |
transforms.Normalize((0.13066047430038452,), (0.30810782313346863,)),
50 |
transforms.Pad(2, fill=0, padding_mode='constant'),
51 |
52 |
53 |
54 |
# Load the MNIST dataset, which is 28x28x1 images (black and white ~ 1 channel); the transforms above pad them to 32x32x1.
55 |
56 |
# http://yann.lecun.com/exdb/mnist/
57 |
# datasets.MNIST
58 |
59 |
# Loading from Dataset and DataLoader, https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
60 |
# Load using known datasets, but what if we have our own dataset?
61 |
# training_data = datasets.MNIST(
62 |
# root="data",
63 |
# train=True,
64 |
# download=True,
65 |
# transform=ToTensor()
66 |
# )
67 |
68 |
# test_data = datasets.MNIST(
69 |
# root="data",
70 |
# train=False,
71 |
# download=True,
72 |
# transform=ToTensor()
73 |
# )
74 |
75 |
# Loading from a custom dataset
76 |
import idx2numpy
77 |
class CustomImageDataset(Dataset):
78 |
79 |
This class must inherit from the torch.utils.data.Dataset class.
80 |
And contain the functions __init__, __len__, and __getitem__.
81 |
82 |
def __init__(self, annotations_file, image_file, transform=None, target_transform=None):
83 |
self.img_labels = idx2numpy.convert_from_file(annotations_file)
84 |
self.images = idx2numpy.convert_from_file(image_file)
85 |
self.transform = transform
86 |
self.target_transform = target_transform
87 |
88 |
def __len__(self):
89 |
return len(self.img_labels)
90 |
91 |
def __getitem__(self, idx):
92 |
"""Get the image and label at the index idx."""
93 |
label = self.img_labels[idx]
94 |
img = self.images[idx]
95 |
img = Image.fromarray(img)
96 |
97 |
if self.transform:
98 |
img = self.transform(img)
99 |
if self.target_transform:
100 |
label = self.target_transform(label)
101 |
# Adding 0 padding to make it 32x32, as the model expects this.
102 |
103 |
# img = img.unsqueeze(0) # Add channel dimension, as model expects this.
104 |
return img, label # Return as float32, and label as int., should solve issue.
105 |
106 |
107 |
# Make the LeNet-5 model
108 |
class LeNet5Model(nn.Module):
109 |
def __init__(self):
110 |
111 |
# Define activation, and sequential layers, then make forward pass.
112 |
self.tanh = nn.Tanh()
113 |
# Convolutional layers, https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
114 |
# Avg Pooling, https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html
115 |
self.le_stack = nn.Sequential(
116 |
nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1),
117 |
118 |
nn.AvgPool2d(kernel_size=2, stride=2),
119 |
nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
120 |
121 |
nn.AvgPool2d(kernel_size=2, stride=2),
122 |
nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1),
123 |
124 |
125 |
# Fully connected layers, https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
126 |
self.fc_stack = nn.Sequential(
127 |
nn.Linear(in_features=120, out_features=84),
128 |
129 |
nn.Linear(in_features=84, out_features=10)
130 |
131 |
132 |
def forward(self, x):
133 |
"""Forward pass of the model."""
134 |
x = self.le_stack(x)
135 |
x = x.reshape(x.shape[0], -1)
136 |
x = self.fc_stack(x)
137 |
return x
138 |
139 |
140 |
def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001, saved_model=None):
141 |
142 |
Given a model, train the model using the train_loader and test_loader, and show metrics,
143 |
saving the model parameters whenever the test accuracy beats the best accuracy recorded so far.
144 |
145 |
# When we have model, we need the loss function and optimizer we will use.
146 |
# Loss function, https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
147 |
loss_fn = nn.CrossEntropyLoss() # because we calculating probabilities and this is a classification problem.
148 |
# Optimizer, https://pytorch.org/docs/stable/optim.html
149 |
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-6) # learning rate of 0.001
150 |
best_accuracy = 0.0
151 |
# See if best accuracy is saved, if so, get current best accuracy.
152 |
if os.path.exists("best_model.txt"):
153 |
with open("best_model.txt", "r") as file:
154 |
best_accuracy = float(file.read())
155 |
156 |
if saved_model is not None: # Load the model parameters if they exist.
157 |
158 |
159 |
# Training loop
160 |
for i in range(epochs):
161 |
162 |
print("Epoch ", i)
163 |
for batch, (x, y) in enumerate(train_loader):
164 |
165 |
x, y = x.to(device), y.to(device)
166 |
# Forward pass
167 |
168 |
# print(x.shape, y.shape)
169 |
# Shape of x is [64, 28, 28] and y is [64,]
170 |
# But x needs to include the channels, so shape should be [64, 1, 28, 28]
171 |
# x = x.view(-1, 1, 32, 32)
172 |
173 |
y_pred = model(x)
174 |
# Compute loss
175 |
loss = loss_fn(y_pred, y)
176 |
# Zero gradients, backward pass, and update weights
177 |
178 |
179 |
180 |
# Print loss
181 |
if batch % 250 == 0:
182 |
print(f"Epoch {i} batch {batch} loss: {loss.item()}")
183 |
# Evaluate the model
184 |
185 |
correct, total = 0, 0
186 |
with torch.no_grad():
187 |
for x, y in test_loader:
188 |
x, y = x.to(device), y.to(device)
189 |
#x = x.view(-1, 1, 32, 32)
190 |
y_pred = model(x)
191 |
_, predicted = torch.max(y_pred, 1)
192 |
total += y.size(0)
193 |
correct += (predicted == y).sum().item()
194 |
print(f"Epoch {i} accuracy: {correct/total}")
195 |
if correct/total > best_accuracy:
196 |
best_accuracy = correct/total
197 |
torch.save(model.state_dict(), "lenet_mnist_model.pth")
198 |
with open("best_model.txt", "w") as file:
199 |
200 |
print("Training complete.")
201 |
202 |
203 |
def init_weights(m):
204 |
if isinstance(m, nn.Conv2d):
205 |
206 |
if m.bias is not None:
207 |
208 |
elif isinstance(m, nn.Linear):
209 |
210 |
211 |
212 |
if __name__ == "__main__":
213 |
# Testing conversion from ubyte idx to numpy array
214 |
215 |
# file_name = "t10k-images.idx3-ubyte"
216 |
# label_file = "t10k-labels.idx1-ubyte"
217 |
# file_path = os.path.join("mnist_dataset", label_file)
218 |
# image_array = idx2numpy.convert_from_file(file_path)
219 |
# print(image_array.shape) # (10000, 28, 28) # 10000 images of 28x28 pixels
220 |
221 |
222 |
test_data = CustomImageDataset("mnist_dataset/t10k-labels.idx1-ubyte", "mnist_dataset/t10k-images.idx3-ubyte", transform=test_transform)
223 |
print((test_data[0])[0].shape, "label value", test_data[0][1]) # Getting image from dataset.
224 |
train_data = CustomImageDataset("mnist_dataset/train-labels.idx1-ubyte", "mnist_dataset/train-images.idx3-ubyte", transform=train_transform)
225 |
226 |
# Create a DataLoader, so we can iterate through the dataset in batches.
227 |
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)
228 |
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
229 |
230 |
# print(f"Output shape of train function, ", next(iter(test_loader))[0].shape) # [ 64x28x28 ] [64,] Image and labels.
231 |
232 |
# Display image and label. - From docs.
233 |
# train_features, train_labels = next(iter(train_loader))
234 |
# print(f"Feature batch shape: {train_features.size()}")
235 |
# print(f"Labels batch shape: {train_labels.size()}")
236 |
# img = train_features[0].squeeze()
237 |
# label = train_labels[0]
238 |
# plt.imshow(img, cmap="gray")
239 |
# plt.show()
240 |
# print(f"Label: {label}")
241 |
242 |
model = LeNet5Model().to(device)
243 |
model.apply(init_weights) # Apply Xavier initialisation to the model.
244 |
245 |
246 |
247 |
# Training the model
248 |
train_model(model, train_loader, test_loader, epochs=1000, learning_rate=0.001)
249 |
# Save the model parameters
250 |
torch.save(model.state_dict(), "lenet_mnist_model.pth")
251 |
252 |
# Current errors include:
253 |
# - RuntimeError: Input type (unsigned char) and bias type (float) should be the same
254 |
# - I solved this by converting the image from the custom loader to float32 values.
255 |
# - RuntimeError: Calculated padded input size per channel: (4 x 4). Kernel size: (5 x 5). Kernel size can't be greater than actual input size
256 |
# - I solved this by adding padding to make it 32x32, as the model expects this and the dataset is 28x28.
257 |
# - The model also had problems when evaluating; it is important that the dims are batch x channels x height x width, and that the labels are int.
258 |
259 |
# Ways to improve accuracy:
260 |
# We will try to normalise the dataset via z-score, so values which are brighter are not given more importance. [98.99% accuracy]
261 |
# We can apply rotations and affine transformations to potentially improve the model, by making it learn general patterns rather than one exact orientation.
262 |
# Xavier initialisation of CNN and FC layers, to prevent vanishing gradients.
263 |
# Increase the angle of rotation and affine transformations to see if it improves the model.
264 |
# We could potentially help the model by applying an enhancement filter from computer vision (negative / inverse Laplacian) to the image.
265 |
266 |
# We do not know whether the model is overfitting, as we do not have a graph of the training and validation loss.
@@ -0,0 +1,3 @@
1 |
version https://git-lfs.github.com/spec/v1
2 |
oid sha256:05ff80605ac574e7e667ec532c8c4b94845e2b11c0c69c06feccd7d86dbab95f
3 |
size 250431
![]() |
@@ -0,0 +1,3 @@
1 |
version https://git-lfs.github.com/spec/v1
2 |
oid sha256:0fa7898d509279e482958e8ce81c8e77db3f2f8254e26661ceb7762c4d494ce7
3 |
size 7840016
Binary file (10 kB). View file
@@ -0,0 +1,3 @@
1 |
version https://git-lfs.github.com/spec/v1
2 |
oid sha256:ba891046e6505d7aadcbbe25680a0738ad16aec93bde7f9b65e87a2fc25776db
3 |
size 47040016
Binary file (60 kB). View file
@@ -0,0 +1,62 @@
1 |
import idx2numpy, torch
2 |
import torch
3 |
import torch.nn as nn
4 |
import torch.nn.functional as F
5 |
from torchvision import transforms, datasets
6 |
from PIL import Image
7 |
8 |
9 |
class ApplyEnhancementFilter:
10 |
def __init__(self, out_channels, kernel_size, stride=1, padding=0, bias=False):
11 |
12 |
Initialize the convolution parameters.
13 |
14 |
self.out_channels = out_channels
15 |
self.kernel_size = kernel_size
16 |
self.stride = stride
17 |
self.padding = padding
18 |
self.bias = bias
19 |
# Define the convolutional layer (not trained here)
20 |
self.conv = nn.Conv2d(in_channels=1, # Adjust this based on your image channels (1 for grayscale, 3 for RGB)
21 |
22 |
23 |
24 |
25 |
26 |
27 |
# Example: Manually defining a simple sharpening (edge-enhancement) kernel, i.e. the identity minus the Laplacian
28 |
# For a real use-case, the kernel weights would be learned or defined according to the filter you need.
29 |
edge_detection_kernel = torch.tensor([[0, -1., 0.],
30 |
[-1., 5., -1.],
31 |
[0., -1., 0.]]).unsqueeze(0).unsqueeze(0)
32 |
self.conv.weight = nn.Parameter(edge_detection_kernel.float())
33 |
34 |
def __call__(self, img):
35 |
36 |
Apply the convolution transformation.
37 |
38 |
# Convert PIL image to tensor
39 |
img_tensor = transforms.functional.to_tensor(img).unsqueeze(0) # Add batch dimension
40 |
# Apply convolution
41 |
conv_img = self.conv(img_tensor)
42 |
# Remove batch dimension and convert back to PIL image for further transformations or visualization
43 |
conv_img_pil = transforms.functional.to_pil_image(conv_img.squeeze(0))
44 |
return conv_img_pil
45 |
46 |
47 |
if __name__ == "__main__":
48 |
# It is important to normalise the dataset, so that no specific input affects the model more than another based purely on its raw input values.
49 |
# As values can range from 0-255, this can cause problems, so z-score will be used via Transforms.
50 |
51 |
# First we need the mean and standard deviation of train dataset.
52 |
53 |
train_images = idx2numpy.convert_from_file("mnist_dataset/train-images.idx3-ubyte")
54 |
55 |
# Convert the training images to a PyTorch tensor and scale values to [0, 1]
56 |
train_images_tensor = torch.tensor(train_images, dtype=torch.float32) / 255.0
57 |
58 |
train_mean = train_images_tensor.mean()
59 |
train_std = train_images_tensor.std()
60 |
61 |
print(f"Mean: {train_mean}, Std: {train_std}")
62 |
# Mean: 0.13066047430038452, Std: 0.30810782313346863