amanmibra committed on
Commit
da26829
·
1 Parent(s): cf26dbd

Add model saving

Browse files
Files changed (1) hide show
  1. pipelines/train.py +36 -13
pipelines/train.py CHANGED
@@ -1,5 +1,6 @@
1
  import sys
2
  sys.path.append('..')
 
3
 
4
  # torch
5
  import torch
@@ -32,20 +33,20 @@ stub = Stub(
32
  )
33
 
34
  @stub.function(
35
- gpu=gpu.A100(memory=20),
36
  mounts=[
37
  Mount.from_local_file(local_path='dataset.py'),
38
  Mount.from_local_file(local_path='cnn.py'),
39
  ],
40
- timeout=EPOCHS * 60,
41
- secret=Secret.from_name("wandb")
42
  )
43
  def train(
44
  model,
45
  train_dataloader,
46
  loss_fn,
47
  optimizer,
48
- device="cuda",
49
  epochs=10,
50
  ):
51
  import os
@@ -57,8 +58,12 @@ def train(
57
  print("Begin model training...")
58
  begin = time.time()
59
 
 
 
60
  # set model to cuda
61
- model = model.to(device)
 
 
62
 
63
  # metrics
64
  training_acc = []
@@ -71,7 +76,7 @@ def train(
71
  then = time.time()
72
 
73
  # train model
74
- train_epoch_loss, train_epoch_acc = train_epoch.call(model, train_dataloader, loss_fn, optimizer, device)
75
 
76
  # training metrics
77
  training_loss.append(train_epoch_loss/len(train_dataloader))
@@ -79,18 +84,19 @@ def train(
79
  wandb.log({'training_loss': training_loss[i], 'training_acc': training_acc[i]})
80
 
81
  now = time.time()
82
- print("Training Loss: {:.2f}, Training Accuracy: {:.2f}, Time: {:.2f}s".format(training_loss[i], training_acc[i], now - then))
83
 
84
- print ("-------------------------------------------- \n")
85
 
86
  end = time.time()
87
  wandb.finish()
88
-
89
  print("-------- Finished Training --------")
90
  print("-------- Total Time -- {:.2f}s --------".format(end - begin))
91
 
 
 
92
  @stub.function(
93
- gpu=gpu.A100(memory=20),
94
  mounts=[
95
  Mount.from_local_file(local_path='dataset.py'),
96
  Mount.from_local_file(local_path='cnn.py'),
@@ -124,12 +130,26 @@ def train_epoch(model, train_dataloader, loss_fn, optimizer, device):
124
  train_acc += (prediction == target).sum().item()/len(prediction)
125
  total += 1
126
 
127
- return train_loss, train_acc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  @stub.local_entrypoint()
130
  def main():
131
  print("Initiating model training...")
132
- device = "cpu"
133
 
134
  # instantiating our dataset object and create data loader
135
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
@@ -151,5 +171,8 @@ def main():
151
  optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
152
 
153
  # train model
154
- train.call(model, train_dataloader, loss_fn, optimizer, "cuda", EPOCHS)
 
 
 
155
 
 
1
  import sys
2
  sys.path.append('..')
3
+ import time
4
 
5
  # torch
6
  import torch
 
33
  )
34
 
35
  @stub.function(
36
+ gpu="any",
37
  mounts=[
38
  Mount.from_local_file(local_path='dataset.py'),
39
  Mount.from_local_file(local_path='cnn.py'),
40
  ],
41
+ timeout=EPOCHS * 200,
42
+ secret=Secret.from_name("wandb"),
43
  )
44
  def train(
45
  model,
46
  train_dataloader,
47
  loss_fn,
48
  optimizer,
49
+ origin_device="cuda",
50
  epochs=10,
51
  ):
52
  import os
 
58
  print("Begin model training...")
59
  begin = time.time()
60
 
61
+ modal_device = origin_device
62
+
63
  # set model to cuda
64
+ if torch.cuda.is_available() and modal_device != "cuda":
65
+ modal_device = "cuda"
66
+ model = model.to(modal_device)
67
 
68
  # metrics
69
  training_acc = []
 
76
  then = time.time()
77
 
78
  # train model
79
+ model, train_epoch_loss, train_epoch_acc = train_epoch.call(model, train_dataloader, loss_fn, optimizer, modal_device)
80
 
81
  # training metrics
82
  training_loss.append(train_epoch_loss/len(train_dataloader))
 
84
  wandb.log({'training_loss': training_loss[i], 'training_acc': training_acc[i]})
85
 
86
  now = time.time()
87
+ print("Training Loss: {:.2f}, Training Accuracy: {:.4f}, Time: {:.2f}s".format(training_loss[i], training_acc[i], now - then))
88
 
89
+ print ("-------------------------------------------------------- \n")
90
 
91
  end = time.time()
92
  wandb.finish()
 
93
  print("-------- Finished Training --------")
94
  print("-------- Total Time -- {:.2f}s --------".format(end - begin))
95
 
96
+ return model.to(origin_device)
97
+
98
  @stub.function(
99
+ gpu="any",
100
  mounts=[
101
  Mount.from_local_file(local_path='dataset.py'),
102
  Mount.from_local_file(local_path='cnn.py'),
 
130
  train_acc += (prediction == target).sum().item()/len(prediction)
131
  total += 1
132
 
133
+ return model, train_loss, train_acc
134
+
135
def save_model(model):
    """Save the model's ``state_dict`` to a timestamped file under ``models/``.

    The file is named ``models/void_<YYYYmmdd_HHMMSS>.pth`` using the local
    time at the moment of the call, relative to the current working directory.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    """
    import os

    now = time.strftime("%Y%m%d_%H%M%S")
    model_filename = f"models/void_{now}.pth"

    # torch.save does not create missing parent directories, so make sure
    # "models/" exists before writing; otherwise a fresh checkout would lose
    # the trained weights at the very last step.
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)

    torch.save(model.state_dict(), model_filename)
    print(f"Trained void model saved at {model_filename}")
140
+
141
def get_device():
    """Return ``"cuda"`` if torch reports CUDA as available, otherwise ``"cpu"``."""
    return "cuda" if torch.cuda.is_available() else "cpu"
148
 
149
  @stub.local_entrypoint()
150
  def main():
151
  print("Initiating model training...")
152
+ device = get_device()
153
 
154
  # instantiating our dataset object and create data loader
155
  mel_spectrogram = torchaudio.transforms.MelSpectrogram(
 
171
  optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
172
 
173
  # train model
174
+ model = train.call(model, train_dataloader, loss_fn, optimizer, device, 3)
175
+
176
+ # save model
177
+ save_model(model)
178