Kevin Fink
committed on
Commit
·
2a42237
1
Parent(s):
dc0b7f0
deve
Browse files
app.py
CHANGED
@@ -84,12 +84,12 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
84 |
metric_for_best_model="loss",
|
85 |
greater_is_better=True,
|
86 |
logging_dir='/data/logs',
|
87 |
-
logging_steps=
|
88 |
#push_to_hub=True,
|
89 |
hub_model_id=hub_id.strip(),
|
90 |
fp16=True,
|
91 |
#lr_scheduler_type='cosine',
|
92 |
-
save_steps=
|
93 |
save_total_limit=3,
|
94 |
)
|
95 |
# Check if a checkpoint exists and load it
|
@@ -234,10 +234,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
234 |
#)
|
235 |
|
236 |
# Fine-tune the model
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
trainer.push_to_hub(commit_message="Training complete!")
|
242 |
except Exception as e:
|
243 |
return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
|
|
|
84 |
metric_for_best_model="loss",
|
85 |
greater_is_better=True,
|
86 |
logging_dir='/data/logs',
|
87 |
+
logging_steps=210,
|
88 |
#push_to_hub=True,
|
89 |
hub_model_id=hub_id.strip(),
|
90 |
fp16=True,
|
91 |
#lr_scheduler_type='cosine',
|
92 |
+
save_steps=210, # Save checkpoint every 210 steps
|
93 |
save_total_limit=3,
|
94 |
)
|
95 |
# Check if a checkpoint exists and load it
|
|
|
234 |
#)
|
235 |
|
236 |
# Fine-tune the model
|
237 |
+
if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
|
238 |
+
train_result = trainer.train(resume_from_checkpoint=True)
|
239 |
+
else:
|
240 |
+
train_result = trainer.train()
|
241 |
trainer.push_to_hub(commit_message="Training complete!")
|
242 |
except Exception as e:
|
243 |
return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
|