Vishwas1 committed · verified
Commit a2a02fa · 1 Parent(s): 2819aa9

Update train_model.py

Files changed (1)
  1. train_model.py +30 -67
train_model.py CHANGED
@@ -16,7 +16,6 @@ import torch
 import os
 from huggingface_hub import login, HfApi
 import logging
-
 from torch.optim import AdamW  # Import PyTorch's AdamW
 
 def setup_logging(log_file_path):
@@ -64,18 +63,14 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
     """
     logging.info(f"Loading dataset '{dataset_name}' for task '{task}'...")
     try:
-        if '/' in dataset_name:
-            dataset, config = dataset_name.split('/', 1)
-            dataset = load_dataset("stanfordnlp/imdb", split='train')
-        else:
-            dataset = load_dataset("stanfordnlp/imdb", split='train')
-
+        dataset = load_dataset(dataset_name, split='train')
         logging.info("Dataset loaded successfully.")
 
         def tokenize_function(examples):
-            return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=sequence_length)
-
-        # Tokenize the dataset
+            # Truncate and set max_length, but let DataCollator handle padding
+            return tokenizer(examples['text'], truncation=True, max_length=sequence_length)
+
+        # Tokenize the dataset using the modified tokenize_function
         tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
         logging.info("Dataset tokenization complete.")
         return tokenized_datasets
@@ -100,7 +95,6 @@ def initialize_model(task, model_name, vocab_size, sequence_length, hidden_size,
         intermediate_size=4 * hidden_size,
         hidden_act='gelu',
         use_cache=True,
-        truncation=False
     )
     model = GPT2LMHeadModel(config)
     logging.info("GPT2LMHeadModel initialized successfully.")
@@ -172,31 +166,18 @@ def main():
         if tokenizer.pad_token is None:
             logging.info("Setting pad_token to eos_token.")
             tokenizer.pad_token = tokenizer.eos_token
-            logging.info(f"Tokenizer pad_token set to: {tokenizer.pad_token}")
-            # Initialize model after setting pad_token
-            model = initialize_model(
-                task=args.task,
-                model_name=args.model_name,
-                vocab_size=args.vocab_size,
-                sequence_length=args.sequence_length,
-                hidden_size=args.hidden_size,
-                num_layers=args.num_layers,
-                attention_heads=args.attention_heads
-            )
-            model.resize_token_embeddings(len(tokenizer))
-            logging.info("Resized token embeddings to accommodate pad_token.")
-        else:
-            logging.info(f"Tokenizer already has pad_token set to: {tokenizer.pad_token}")
-            # Initialize model normally
-            model = initialize_model(
-                task=args.task,
-                model_name=args.model_name,
-                vocab_size=args.vocab_size,
-                sequence_length=args.sequence_length,
-                hidden_size=args.hidden_size,
-                num_layers=args.num_layers,
-                attention_heads=args.attention_heads
-            )
+
+        # Initialize model
+        model = initialize_model(
+            task=args.task,
+            model_name=args.model_name,
+            vocab_size=args.vocab_size,
+            sequence_length=args.sequence_length,
+            hidden_size=args.hidden_size,
+            num_layers=args.num_layers,
+            attention_heads=args.attention_heads
+        )
+        model.resize_token_embeddings(len(tokenizer))
     except Exception as e:
         logging.error(f"Error initializing tokenizer or model: {str(e)}")
         raise e
@@ -223,36 +204,17 @@ def main():
         raise ValueError("Unsupported task type for data collator.")
 
     # Define training arguments
-    if args.task == "generation":
-        training_args = TrainingArguments(
-            output_dir=f"./models/{args.model_name}",
-            num_train_epochs=3,
-            per_device_train_batch_size=8,
-            save_steps=5000,
-            save_total_limit=2,
-            logging_steps=500,
-            learning_rate=5e-4,
-            remove_unused_columns=False,
-            push_to_hub=False  # We'll handle pushing manually
-
-        )
-    elif args.task == "classification":
-        training_args = TrainingArguments(
-            output_dir=f"./models/{args.model_name}",
-            num_train_epochs=3,
-            per_device_train_batch_size=16,
-            evaluation_strategy="epoch",
-            save_steps=5000,
-            save_total_limit=2,
-            logging_steps=500,
-            learning_rate=5e-5,
-            remove_unused_columns=False,
-            push_to_hub=False  # We'll handle pushing manually
-
-        )
-    else:
-        logging.error("Unsupported task type for training arguments.")
-        raise ValueError("Unsupported task type for training arguments.")
+    training_args = TrainingArguments(
+        output_dir=f"./models/{args.model_name}",
+        num_train_epochs=3,
+        per_device_train_batch_size=8 if args.task == "generation" else 16,
+        save_steps=5000,
+        save_total_limit=2,
+        logging_steps=500,
+        learning_rate=5e-4 if args.task == "generation" else 5e-5,
+        remove_unused_columns=False,
+        push_to_hub=False
+    )
 
     # Initialize Trainer with PyTorch's AdamW optimizer
     trainer = Trainer(
@@ -260,7 +222,7 @@ def main():
         args=training_args,
         train_dataset=tokenized_datasets,
         data_collator=data_collator,
-        optimizers=(get_optimizer(model, training_args.learning_rate), None)  # None for scheduler
+        optimizers=(get_optimizer(model, training_args.learning_rate), None)
     )
 
     # Start training
@@ -303,3 +265,4 @@ def main():
 if __name__ == "__main__":
     main()
 
+
 
 
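Note (not part of the commit): the updated tokenize_function drops padding='max_length' and leaves padding to the data collator at batch time. The collator class itself is not shown in these hunks; assuming the generation task uses transformers' DataCollatorForLanguageModeling with mlm=False, a minimal sketch of that dynamic-padding behavior looks like this (the "gpt2" checkpoint is used purely for illustration):

from transformers import GPT2TokenizerFast, DataCollatorForLanguageModeling

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

# mlm=False -> causal LM; labels are a copy of input_ids with padded positions set to -100
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tokenize without padding, exactly as the new tokenize_function does
features = [tokenizer(text, truncation=True, max_length=128)
            for text in ["short example", "a noticeably longer example sentence"]]

batch = collator(features)
print(batch["input_ids"].shape)  # padded only to the longest sequence in this batch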
 
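Note (not part of the commit): get_optimizer, passed to Trainer via the optimizers tuple above, is defined elsewhere in train_model.py and does not appear in this diff. Assuming it simply wraps the AdamW imported at the top of the file, a hypothetical reconstruction would be:

from torch.optim import AdamW

def get_optimizer(model, learning_rate):
    # Hypothetical sketch: plain AdamW over all trainable parameters.
    # Any weight-decay grouping or extra options in the real helper are unknown.
    return AdamW(model.parameters(), lr=learning_rate)

Leaving the second element of the optimizers tuple as None lets Trainer create its default learning-rate scheduler from the TrainingArguments.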