# LLMatic-main/train.py
# (upload metadata from the original file page: Ahmed-Salah — "Upload 57 files" — d625688 verified)
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import multiprocessing as mp
from conf.config import Config
from datasets import get_datasets
import ray
import time
# Global project configuration; supplies NUM_GPUS / NUM_CPUS consumed by the @ray.remote decorators below.
cfg = Config()
@ray.remote(num_gpus=cfg.NUM_GPUS)
def train_net_on_gpu(net, epochs=1):
    """Train `net` on the project datasets as a GPU-reserving ray task.

    Args:
        net: a torch.nn.Module classifier whose outputs are class logits.
        epochs: number of passes over the training set.

    Returns:
        The summed validation loss of the final epoch (not averaged, to
        preserve the original return contract).
    """
    trainset, trainloader, validset, validloader = get_datasets()
    dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"The network will train on {dev} device")
    criterion = nn.CrossEntropyLoss()
    net = net.to(dev)
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    print("------TRAINING------")
    for epoch in range(epochs):  # loop over the dataset multiple times
        # Re-enable training-mode layers (dropout/batchnorm) after the
        # net.eval() issued at the end of the previous epoch.
        net.train()
        running_loss = 0.0
        correct = 0.0
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(dev), labels.to(dev)
            # zero the parameter gradients
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # BUG FIX: compare predicted class indices with labels; the original
            # compared raw logits to labels element-wise, which is never a match.
            correct += (outputs.argmax(dim=1) == labels).float().sum().item()
        training_accuracy = 100 * correct / len(trainset)
        valid_loss = 0.0
        valid_correct = 0.0
        net.eval()  # disable dropout/batchnorm updates for validation
        with torch.no_grad():  # no gradients needed during validation
            for data, labels in validloader:
                data, labels = data.to(dev), labels.to(dev)
                target = net(data)
                loss = criterion(target, labels)
                valid_loss += loss.item()
                valid_correct += (target.argmax(dim=1) == labels).float().sum().item()
        valid_accuracy = 100 * valid_correct / len(validset)
        print(f'Training Epochs: {epoch}\t\t Loss: {running_loss}\t\t Train Accuracy: {training_accuracy}')
        print(f'\t\t Valid Loss: {valid_loss}\t\t Valid Accuracy: {valid_accuracy}')
    return valid_loss
@ray.remote(num_cpus=cfg.NUM_CPUS)
def train_net_on_cpu(net, epochs=1):
    """Train `net` on the project datasets as a CPU-reserving ray task.

    Args:
        net: a torch.nn.Module classifier whose outputs are class logits.
        epochs: number of passes over the training set.

    Returns:
        The final epoch's validation loss averaged over validation batches.
    """
    trainset, trainloader, validset, validloader = get_datasets()
    dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"The network will train on {dev} device")
    criterion = nn.CrossEntropyLoss()
    net = net.to(dev)
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    print("------TRAINING------")
    for epoch in range(epochs):  # loop over the dataset multiple times
        # Re-enable training-mode layers after net.eval() from the previous epoch.
        net.train()
        training_loss = 0.0
        correct = 0.0
        total = 0.0
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(dev), labels.to(dev)
            # zero the parameter gradients
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            training_loss += loss.item()
            total += labels.size(0)
            # BUG FIX: compare predicted class indices with labels; the original
            # compared raw logits to labels element-wise, which is never a match.
            correct += (outputs.argmax(dim=1) == labels).float().sum().item()
        training_accuracy = 100 * correct / total
        valid_loss = 0.0
        valid_correct = 0.0
        val_total = 0.0
        net.eval()  # disable dropout/batchnorm updates for validation
        with torch.no_grad():  # no gradients needed during validation
            for data, labels in validloader:
                data, labels = data.to(dev), labels.to(dev)
                target = net(data)
                loss = criterion(target, labels)
                valid_loss += loss.item()
                val_total += labels.size(0)
                valid_correct += (target.argmax(dim=1) == labels).float().sum().item()
        valid_accuracy = 100 * valid_correct / val_total
        print(f'Training Epochs: {epoch}\t\t Training Loss: {training_loss / len(trainloader)}\t\t Train Accuracy: {training_accuracy}')
        print(f'\t\t Valid Loss: {valid_loss / len(validloader)}\t\t Valid Accuracy: {valid_accuracy}')
    return valid_loss / len(validloader)
@ray.remote(num_cpus=cfg.NUM_CPUS)
def forward_pass_on_cpu(net):
    """Evaluate `net` with a single full forward pass over the training set.

    No parameters are updated; this measures loss, accuracy, and wall-clock
    inference time only.

    Args:
        net: a torch.nn.Module classifier whose outputs are class logits.

    Returns:
        (mean loss per training batch, inference time in seconds).
    """
    trainset, trainloader, validset, validloader = get_datasets()
    dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"The network will train on {dev} device")
    print("------Forward passing------")
    criterion = nn.CrossEntropyLoss()
    net = net.to(dev)
    net.eval()  # inference only — freeze dropout/batchnorm behavior
    running_loss = 0.0
    correct = 0.0
    start = time.time()
    # No optimizer is needed: this pass never calls backward()/step().
    with torch.no_grad():
        for data, target in trainloader:
            # BUG FIX: inputs must be moved to the network's device; the original
            # fed CPU tensors to a model already moved to `dev`.
            data, target = data.to(dev), target.to(dev)
            output = net(data)
            loss = criterion(output, target)
            running_loss += loss.item()
            # BUG FIX: compare predicted class indices, not raw logits, to labels.
            correct += (output.argmax(dim=1) == target).float().sum().item()
    inference_time = time.time() - start
    accuracy = 100 * correct / len(trainset)
    # BUG FIX: average over the loader actually iterated (trainloader); the
    # original divided by len(validloader).
    print(f'\t\t Loss: {running_loss/len(trainloader)}\t\t Accuracy: {accuracy}\t\t Inference Time: {inference_time}')
    return running_loss / len(trainloader), inference_time
@ray.remote(num_gpus=cfg.NUM_GPUS)
def forward_pass_on_gpu(net):
    """Evaluate `net` with a single full forward pass over the training set,
    as a GPU-reserving ray task.

    No parameters are updated; this measures loss, accuracy, and wall-clock
    inference time only.

    Args:
        net: a torch.nn.Module classifier whose outputs are class logits.

    Returns:
        (mean loss per training batch, inference time in seconds).
    """
    trainset, trainloader, validset, validloader = get_datasets()
    dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"The network will train on {dev} device")
    print("------Forward passing------")
    criterion = nn.CrossEntropyLoss()
    net = net.to(dev)
    net.eval()  # inference only — freeze dropout/batchnorm behavior
    running_loss = 0.0
    correct = 0.0
    start = time.time()
    # No optimizer is needed: this pass never calls backward()/step().
    with torch.no_grad():
        for data, target in trainloader:
            # BUG FIX: inputs must be moved to the network's device; the original
            # fed CPU tensors to a model already moved to `dev`.
            data, target = data.to(dev), target.to(dev)
            output = net(data)
            loss = criterion(output, target)
            running_loss += loss.item()
            # BUG FIX: compare predicted class indices, not raw logits, to labels.
            correct += (output.argmax(dim=1) == target).float().sum().item()
    inference_time = time.time() - start
    accuracy = 100 * correct / len(trainset)
    # BUG FIX: average over the loader actually iterated (trainloader); the
    # original divided by len(validloader).
    print(f'\t\t Loss: {running_loss/len(trainloader)}\t\t Accuracy: {accuracy}\t\t Inference Time: {inference_time}')
    return running_loss / len(trainloader), inference_time
def multiprocess_training(training_func, num_processes, *args, **kwargs):
    """Run `training_func` over zipped argument tuples in a process pool.

    Each element of `zip(*args)` is passed to `training_func` as a single
    tuple argument. NOTE(review): the commented-out wrapper in the original
    (`training_func(*args)`) suggests per-tuple unpacking via `pool.starmap`
    may have been intended — confirm with callers before changing.

    Args:
        training_func: picklable callable executed in the worker processes.
        num_processes: number of worker processes in the pool.
        *args: parallel argument sequences, zipped into per-call tuples.
        **kwargs: accepted for interface compatibility; currently unused.

    Returns:
        List of results in input order.
    """
    pool = mp.Pool(num_processes)
    try:
        # map() blocks until all work is done and preserves input order.
        results = pool.map(training_func, zip(*args))
    finally:
        # BUG FIX: the pool leaked its worker processes if map() raised;
        # always shut it down and reap the workers.
        pool.close()
        pool.join()
    return results
def transfer_weights(netA, netB, layer_type):
    """Copy weight/bias tensors of every `layer_type` layer from netA into netB.

    Corresponding layers must share names and parameter shapes.

    Args:
        netA: source torch.nn.Module.
        netB: destination torch.nn.Module (modified in place).
        layer_type: class name of the layers to transfer, e.g. "Conv2d".

    Returns:
        netB, with the transferred parameters loaded.
    """
    paramsA = netA.state_dict()
    paramsB = netB.state_dict()
    for name, module in netA.named_modules():
        if type(module).__name__ == layer_type:
            # BUG FIX: the original branched on len(bias.shape) — bias is 1-D
            # for standard layers, so only the `== 1` branch ever ran. A
            # whole-tensor copy works for any rank, so no dispatch is needed.
            paramsB[name + '.weight'].copy_(paramsA[name + '.weight'])
            # Layers built with bias=False have no '.bias' entry; the original
            # raised KeyError on them.
            if name + '.bias' in paramsB:
                paramsB[name + '.bias'].copy_(paramsA[name + '.bias'])
    # Load the updated parameters into netB
    netB.load_state_dict(paramsB)
    return netB
def detect_layers(model):
    """Return the unique class names of all sub-modules of `model`.

    nn.Sequential containers are traversed but not reported; every other
    sub-module contributes its class name. Order of the result is unspecified.
    """
    found = []

    def _walk(module):
        for sub in module.children():
            if not isinstance(sub, nn.Sequential):
                # Record this layer's type, then keep descending either way.
                found.append(type(sub).__name__)
            _walk(sub)

    _walk(model)
    return list(set(found))