Kevin Fink
committed on
Commit
·
f06d0fa
1
Parent(s):
744bfc7
dev
Browse files
app.py
CHANGED
@@ -55,6 +55,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
55 |
|
56 |
# Set training arguments
|
57 |
training_args = TrainingArguments(
|
|
|
|
|
58 |
output_dir='/data/results',
|
59 |
eval_strategy="steps", # Change this to steps
|
60 |
save_strategy='steps',
|
@@ -69,18 +71,18 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
69 |
metric_for_best_model="accuracy",
|
70 |
greater_is_better=True,
|
71 |
logging_dir='/data/logs',
|
72 |
-
logging_steps=
|
73 |
#push_to_hub=True,
|
74 |
hub_model_id=hub_id.strip(),
|
75 |
fp16=True,
|
76 |
#lr_scheduler_type='cosine',
|
77 |
-
save_steps=
|
78 |
save_total_limit=3,
|
79 |
)
|
80 |
# Check if a checkpoint exists and load it
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
|
85 |
tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
|
86 |
|
@@ -136,7 +138,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
136 |
dataset['train'] = dataset['train'].select(range(8000))
|
137 |
del dataset['train']
|
138 |
del dataset['validation']
|
139 |
-
test_set = dataset.map(tokenize_function, batched=True)
|
140 |
test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
|
141 |
return 'TRAINING DONE'
|
142 |
|
@@ -152,7 +154,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
152 |
third_third = dataset['train'].select(range(third_size*2, train_size))
|
153 |
dataset['train'] = third_third
|
154 |
#tokenized_second_half = tokenize_function(third_third)
|
155 |
-
tokenized_second_half = dataset.map(tokenize_function, batched=True)
|
156 |
dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
|
157 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
|
158 |
return 'THIRD THIRD LOADED'
|
@@ -167,7 +169,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
167 |
second_third = dataset['train'].select(range(third_size, third_size*2))
|
168 |
dataset['train'] = second_third
|
169 |
del dataset['test']
|
170 |
-
tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True)
|
171 |
dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
|
172 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
173 |
dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
|
@@ -184,15 +186,15 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
184 |
dataset['train'] = first_third
|
185 |
del dataset['test']
|
186 |
del dataset['validation']
|
187 |
-
tokenized_first_third = dataset.map(tokenize_function, batched=True)
|
188 |
|
189 |
tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
|
190 |
print('DONE')
|
191 |
return 'RUN AGAIN TO LOAD REST OF DATA'
|
192 |
|
193 |
# Fine-tune the model
|
194 |
-
|
195 |
-
|
196 |
else:
|
197 |
train_result = trainer.train()
|
198 |
trainer.push_to_hub(commit_message="Training complete!")
|
|
|
55 |
|
56 |
# Set training arguments
|
57 |
training_args = TrainingArguments(
|
58 |
+
torch_empty_cache_steps=150,
|
59 |
+
overwrite_output_dir=True,
|
60 |
output_dir='/data/results',
|
61 |
eval_strategy="steps", # Change this to steps
|
62 |
save_strategy='steps',
|
|
|
71 |
metric_for_best_model="accuracy",
|
72 |
greater_is_better=True,
|
73 |
logging_dir='/data/logs',
|
74 |
+
logging_steps=250,
|
75 |
#push_to_hub=True,
|
76 |
hub_model_id=hub_id.strip(),
|
77 |
fp16=True,
|
78 |
#lr_scheduler_type='cosine',
|
79 |
+
save_steps=350, # Save checkpoint every 350 steps
|
80 |
save_total_limit=3,
|
81 |
)
|
82 |
# Check if a checkpoint exists and load it
|
83 |
+
if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
|
84 |
+
print("Loading model from checkpoint...")
|
85 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
|
86 |
|
87 |
tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
|
88 |
|
|
|
138 |
dataset['train'] = dataset['train'].select(range(8000))
|
139 |
del dataset['train']
|
140 |
del dataset['validation']
|
141 |
+
test_set = dataset.map(tokenize_function, batched=True, batch_size=20)
|
142 |
test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
|
143 |
return 'TRAINING DONE'
|
144 |
|
|
|
154 |
third_third = dataset['train'].select(range(third_size*2, train_size))
|
155 |
dataset['train'] = third_third
|
156 |
#tokenized_second_half = tokenize_function(third_third)
|
157 |
+
tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=20)
|
158 |
dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
|
159 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
|
160 |
return 'THIRD THIRD LOADED'
|
|
|
169 |
second_third = dataset['train'].select(range(third_size, third_size*2))
|
170 |
dataset['train'] = second_third
|
171 |
del dataset['test']
|
172 |
+
tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=20)
|
173 |
dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
|
174 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
175 |
dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
|
|
|
186 |
dataset['train'] = first_third
|
187 |
del dataset['test']
|
188 |
del dataset['validation']
|
189 |
+
tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=20)
|
190 |
|
191 |
tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
|
192 |
print('DONE')
|
193 |
return 'RUN AGAIN TO LOAD REST OF DATA'
|
194 |
|
195 |
# Fine-tune the model
|
196 |
+
if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
|
197 |
+
train_result = trainer.train(resume_from_checkpoint=True)
|
198 |
else:
|
199 |
train_result = trainer.train()
|
200 |
trainer.push_to_hub(commit_message="Training complete!")
|