Kevin Fink committed on
Commit
f06d0fa
·
1 Parent(s): 744bfc7
Files changed (1) hide show
  1. app.py +13 -11
app.py CHANGED
@@ -55,6 +55,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
55
 
56
  # Set training arguments
57
  training_args = TrainingArguments(
 
 
58
  output_dir='/data/results',
59
  eval_strategy="steps", # Change this to steps
60
  save_strategy='steps',
@@ -69,18 +71,18 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
69
  metric_for_best_model="accuracy",
70
  greater_is_better=True,
71
  logging_dir='/data/logs',
72
- logging_steps=10,
73
  #push_to_hub=True,
74
  hub_model_id=hub_id.strip(),
75
  fp16=True,
76
  #lr_scheduler_type='cosine',
77
- save_steps=500, # Save checkpoint every 500 steps
78
  save_total_limit=3,
79
  )
80
  # Check if a checkpoint exists and load it
81
- #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
82
- #print("Loading model from checkpoint...")
83
- #model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
84
 
85
  tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
86
 
@@ -136,7 +138,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
136
  dataset['train'] = dataset['train'].select(range(8000))
137
  del dataset['train']
138
  del dataset['validation']
139
- test_set = dataset.map(tokenize_function, batched=True)
140
  test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
141
  return 'TRAINING DONE'
142
 
@@ -152,7 +154,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
152
  third_third = dataset['train'].select(range(third_size*2, train_size))
153
  dataset['train'] = third_third
154
  #tokenized_second_half = tokenize_function(third_third)
155
- tokenized_second_half = dataset.map(tokenize_function, batched=True)
156
  dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
157
  dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
158
  return 'THIRD THIRD LOADED'
@@ -167,7 +169,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
167
  second_third = dataset['train'].select(range(third_size, third_size*2))
168
  dataset['train'] = second_third
169
  del dataset['test']
170
- tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True)
171
  dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
172
  dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
173
  dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
@@ -184,15 +186,15 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
184
  dataset['train'] = first_third
185
  del dataset['test']
186
  del dataset['validation']
187
- tokenized_first_third = dataset.map(tokenize_function, batched=True)
188
 
189
  tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
190
  print('DONE')
191
  return 'RUN AGAIN TO LOAD REST OF DATA'
192
 
193
  # Fine-tune the model
194
- #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
195
- #train_result = trainer.train(resume_from_checkpoint=True)
196
  else:
197
  train_result = trainer.train()
198
  trainer.push_to_hub(commit_message="Training complete!")
 
55
 
56
  # Set training arguments
57
  training_args = TrainingArguments(
58
+ torch_empty_cache_steps=150,
59
+ overwrite_output_dir=True,
60
  output_dir='/data/results',
61
  eval_strategy="steps", # Change this to steps
62
  save_strategy='steps',
 
71
  metric_for_best_model="accuracy",
72
  greater_is_better=True,
73
  logging_dir='/data/logs',
74
+ logging_steps=250,
75
  #push_to_hub=True,
76
  hub_model_id=hub_id.strip(),
77
  fp16=True,
78
  #lr_scheduler_type='cosine',
79
+ save_steps=350, # Save checkpoint every 350 steps
80
  save_total_limit=3,
81
  )
82
  # Check if a checkpoint exists and load it
83
+ if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
84
+ print("Loading model from checkpoint...")
85
+ model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
86
 
87
  tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
88
 
 
138
  dataset['train'] = dataset['train'].select(range(8000))
139
  del dataset['train']
140
  del dataset['validation']
141
+ test_set = dataset.map(tokenize_function, batched=True, batch_size=20)
142
  test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
143
  return 'TRAINING DONE'
144
 
 
154
  third_third = dataset['train'].select(range(third_size*2, train_size))
155
  dataset['train'] = third_third
156
  #tokenized_second_half = tokenize_function(third_third)
157
+ tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=20)
158
  dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
159
  dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
160
  return 'THIRD THIRD LOADED'
 
169
  second_third = dataset['train'].select(range(third_size, third_size*2))
170
  dataset['train'] = second_third
171
  del dataset['test']
172
+ tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=20)
173
  dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
174
  dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
175
  dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
 
186
  dataset['train'] = first_third
187
  del dataset['test']
188
  del dataset['validation']
189
+ tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=20)
190
 
191
  tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
192
  print('DONE')
193
  return 'RUN AGAIN TO LOAD REST OF DATA'
194
 
195
  # Fine-tune the model
196
+ if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
197
+ train_result = trainer.train(resume_from_checkpoint=True)
198
  else:
199
  train_result = trainer.train()
200
  trainer.push_to_hub(commit_message="Training complete!")