Kevin Fink committed on
Commit
2a42237
·
1 Parent(s): dc0b7f0
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -84,12 +84,12 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
84
  metric_for_best_model="loss",
85
  greater_is_better=True,
86
  logging_dir='/data/logs',
87
- logging_steps=200,
88
  #push_to_hub=True,
89
  hub_model_id=hub_id.strip(),
90
  fp16=True,
91
  #lr_scheduler_type='cosine',
92
- save_steps=200, # Save checkpoint every 500 steps
93
  save_total_limit=3,
94
  )
95
  # Check if a checkpoint exists and load it
@@ -234,10 +234,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
234
  #)
235
 
236
  # Fine-tune the model
237
- #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
238
- #train_result = trainer.train(resume_from_checkpoint=True)
239
- #else:
240
- train_result = trainer.train()
241
  trainer.push_to_hub(commit_message="Training complete!")
242
  except Exception as e:
243
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
 
84
  metric_for_best_model="loss",
85
  greater_is_better=True,
86
  logging_dir='/data/logs',
87
+ logging_steps=210,
88
  #push_to_hub=True,
89
  hub_model_id=hub_id.strip(),
90
  fp16=True,
91
  #lr_scheduler_type='cosine',
92
+ save_steps=210, # Save checkpoint every 500 steps
93
  save_total_limit=3,
94
  )
95
  # Check if a checkpoint exists and load it
 
234
  #)
235
 
236
  # Fine-tune the model
237
+ if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
238
+ train_result = trainer.train(resume_from_checkpoint=True)
239
+ else:
240
+ train_result = trainer.train()
241
  trainer.push_to_hub(commit_message="Training complete!")
242
  except Exception as e:
243
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"