Spaces:
Sleeping
Sleeping
Sreekanth Tangirala
committed on
Commit
·
1fecae5
1
Parent(s):
3518e5d
adding augmentation and different datasets for test and train
Browse files
train.py
CHANGED
@@ -6,26 +6,40 @@ import torchvision.transforms as transforms
|
|
6 |
from torch.utils.data import DataLoader, Subset
|
7 |
from model import get_model, save_model
|
8 |
from tqdm import tqdm
|
|
|
|
|
9 |
|
10 |
def get_transforms():
|
11 |
"""
|
12 |
-
Define the image transformations
|
13 |
"""
|
14 |
-
|
15 |
transforms.Resize(224),
|
|
|
|
|
|
|
|
|
16 |
transforms.ToTensor(),
|
17 |
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
18 |
std=[0.229, 0.224, 0.225])
|
19 |
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
def get_data(subset_size=None, train=True):
|
22 |
"""
|
23 |
Load and prepare the dataset
|
24 |
-
Args:
|
25 |
-
subset_size (int): If provided, return only a subset of data
|
26 |
-
train (bool): If True, return training data, else test data
|
27 |
"""
|
28 |
-
|
|
|
|
|
29 |
dataset = torchvision.datasets.CIFAR10(
|
30 |
root='./data',
|
31 |
train=train,
|
@@ -66,26 +80,34 @@ def evaluate_model(model, testloader, device):
|
|
66 |
|
67 |
def train_model(model, trainloader, testloader, epochs=100, device='cuda'):
|
68 |
"""
|
69 |
-
Train the model
|
70 |
-
Args:
|
71 |
-
model: The ResNet50 model
|
72 |
-
trainloader: DataLoader for training data
|
73 |
-
testloader: DataLoader for test data
|
74 |
-
epochs (int): Number of epochs to train
|
75 |
-
device (str): Device to train on ('cuda' or 'cpu')
|
76 |
"""
|
77 |
model = model.to(device)
|
78 |
criterion = nn.CrossEntropyLoss()
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
)
|
85 |
|
86 |
-
|
|
|
|
|
|
|
87 |
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
89 |
epoch_pbar = tqdm(range(epochs), desc='Training Progress', position=0)
|
90 |
|
91 |
for epoch in epoch_pbar:
|
@@ -108,6 +130,7 @@ def train_model(model, trainloader, testloader, epochs=100, device='cuda'):
|
|
108 |
loss = criterion(outputs, labels)
|
109 |
loss.backward()
|
110 |
optimizer.step()
|
|
|
111 |
|
112 |
running_loss += loss.item()
|
113 |
_, predicted = outputs.max(1)
|
@@ -124,25 +147,39 @@ def train_model(model, trainloader, testloader, epochs=100, device='cuda'):
|
|
124 |
test_acc = evaluate_model(model, testloader, device)
|
125 |
epoch_pbar.write(f'Epoch {epoch+1}: Train Loss: {avg_loss:.3f} | Train Acc: {epoch_acc:.2f}% | Test Acc: {test_acc:.2f}%')
|
126 |
|
127 |
-
|
128 |
-
|
|
|
|
|
129 |
if test_acc > best_acc:
|
130 |
best_acc = test_acc
|
131 |
save_model(model, 'best_model.pth')
|
132 |
epoch_pbar.write(f'New best test accuracy: {test_acc:.2f}%')
|
|
|
|
|
|
|
133 |
|
134 |
if test_acc > 70:
|
135 |
epoch_pbar.write(f"\nReached target accuracy of 70% on test data!")
|
|
|
|
|
136 |
break
|
137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
if __name__ == "__main__":
|
139 |
# Set device
|
140 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
141 |
print(f"Using device: {device}")
|
142 |
|
143 |
-
# Get train and test data
|
144 |
-
trainloader = get_data(subset_size=
|
145 |
-
testloader = get_data(subset_size=
|
146 |
|
147 |
# Initialize model
|
148 |
model = get_model(num_classes=10)
|
|
|
6 |
from torch.utils.data import DataLoader, Subset
|
7 |
from model import get_model, save_model
|
8 |
from tqdm import tqdm
|
9 |
+
import os
|
10 |
+
from datetime import datetime
|
11 |
|
12 |
def get_transforms():
|
13 |
"""
|
14 |
+
Define the image transformations: an augmented pipeline for training and a plain resize/normalize pipeline for testing; returns (train_transform, test_transform)
|
15 |
"""
|
16 |
+
train_transform = transforms.Compose([
|
17 |
transforms.Resize(224),
|
18 |
+
transforms.RandomHorizontalFlip(),
|
19 |
+
transforms.RandomRotation(15),
|
20 |
+
transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
|
21 |
+
transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
|
22 |
transforms.ToTensor(),
|
23 |
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
24 |
std=[0.229, 0.224, 0.225])
|
25 |
])
|
26 |
+
|
27 |
+
test_transform = transforms.Compose([
|
28 |
+
transforms.Resize(224),
|
29 |
+
transforms.ToTensor(),
|
30 |
+
transforms.Normalize(mean=[0.485, 0.456, 0.406],
|
31 |
+
std=[0.229, 0.224, 0.225])
|
32 |
+
])
|
33 |
+
|
34 |
+
return train_transform, test_transform
|
35 |
|
36 |
def get_data(subset_size=None, train=True):
|
37 |
"""
|
38 |
Load and prepare the dataset
|
|
|
|
|
|
|
39 |
"""
|
40 |
+
train_transform, test_transform = get_transforms()
|
41 |
+
transform = train_transform if train else test_transform
|
42 |
+
|
43 |
dataset = torchvision.datasets.CIFAR10(
|
44 |
root='./data',
|
45 |
train=train,
|
|
|
80 |
|
81 |
def train_model(model, trainloader, testloader, epochs=100, device='cuda'):
|
82 |
"""
|
83 |
+
Train the model with improved hyperparameters and markdown logging
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
"""
|
85 |
model = model.to(device)
|
86 |
criterion = nn.CrossEntropyLoss()
|
87 |
+
|
88 |
+
# Add weight decay and reduce initial learning rate
|
89 |
+
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)
|
90 |
+
|
91 |
+
# One-cycle LR schedule, stepped once per batch: the LR ramps up to max_lr over the first 20% of steps, then anneals
|
92 |
+
scheduler = optim.lr_scheduler.OneCycleLR(
|
93 |
+
optimizer,
|
94 |
+
max_lr=0.001,
|
95 |
+
epochs=epochs,
|
96 |
+
steps_per_epoch=len(trainloader),
|
97 |
+
pct_start=0.2 # Warm up for first 20% of training
|
98 |
)
|
99 |
|
100 |
+
# Create a markdown file for logging
|
101 |
+
log_dir = 'logs'
|
102 |
+
os.makedirs(log_dir, exist_ok=True)
|
103 |
+
log_file = os.path.join(log_dir, f'training_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.md')
|
104 |
|
105 |
+
with open(log_file, 'w') as f:
|
106 |
+
f.write("# Training Log\n\n")
|
107 |
+
f.write("| Epoch | Train Loss | Train Acc | Test Acc | Best Acc |\n")
|
108 |
+
f.write("|-------|------------|-----------|-----------|----------|\n")
|
109 |
+
|
110 |
+
best_acc = 0.0
|
111 |
epoch_pbar = tqdm(range(epochs), desc='Training Progress', position=0)
|
112 |
|
113 |
for epoch in epoch_pbar:
|
|
|
130 |
loss = criterion(outputs, labels)
|
131 |
loss.backward()
|
132 |
optimizer.step()
|
133 |
+
scheduler.step() # Step the scheduler every batch
|
134 |
|
135 |
running_loss += loss.item()
|
136 |
_, predicted = outputs.max(1)
|
|
|
147 |
test_acc = evaluate_model(model, testloader, device)
|
148 |
epoch_pbar.write(f'Epoch {epoch+1}: Train Loss: {avg_loss:.3f} | Train Acc: {epoch_acc:.2f}% | Test Acc: {test_acc:.2f}%')
|
149 |
|
150 |
+
# Log this epoch's metrics to the markdown file (the Best Acc column holds the best from earlier epochs; it is updated just below)
|
151 |
+
with open(log_file, 'a') as f:
|
152 |
+
f.write(f"| {epoch+1:5d} | {avg_loss:.3f} | {epoch_acc:.2f}% | {test_acc:.2f}% | {best_acc:.2f}% |\n")
|
153 |
+
|
154 |
if test_acc > best_acc:
|
155 |
best_acc = test_acc
|
156 |
save_model(model, 'best_model.pth')
|
157 |
epoch_pbar.write(f'New best test accuracy: {test_acc:.2f}%')
|
158 |
+
# Add a marker for best accuracy in the markdown
|
159 |
+
with open(log_file, 'a') as f:
|
160 |
+
f.write(f"**New best accuracy achieved at epoch {epoch+1}**\n\n")
|
161 |
|
162 |
if test_acc > 70:
|
163 |
epoch_pbar.write(f"\nReached target accuracy of 70% on test data!")
|
164 |
+
with open(log_file, 'a') as f:
|
165 |
+
f.write(f"\n**Training stopped at epoch {epoch+1} after reaching target accuracy of 70%**\n")
|
166 |
break
|
167 |
|
168 |
+
# Add final summary to markdown
|
169 |
+
with open(log_file, 'a') as f:
|
170 |
+
f.write(f"\n## Training Summary\n")
|
171 |
+
f.write(f"- Final Test Accuracy: {test_acc:.2f}%\n")
|
172 |
+
f.write(f"- Best Test Accuracy: {best_acc:.2f}%\n")
|
173 |
+
f.write(f"- Total Epochs: {epoch+1}\n")
|
174 |
+
|
175 |
if __name__ == "__main__":
|
176 |
# Set device
|
177 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
178 |
print(f"Using device: {device}")
|
179 |
|
180 |
+
# Get train and test data using larger subsets
|
181 |
+
trainloader = get_data(subset_size=10000, train=True) # Increased from 5000
|
182 |
+
testloader = get_data(subset_size=2000, train=False) # Increased from 1000
|
183 |
|
184 |
# Initialize model
|
185 |
model = get_model(num_classes=10)
|