Kevin Fink committed
Commit · 27c01f2 · 1 Parent(s): 57412a6
dev
app.py
CHANGED
@@ -138,7 +138,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         dataset['train'] = dataset['train'].select(range(8000))
         del dataset['train']
         del dataset['validation']
-        test_set = dataset.map(tokenize_function, batched=True, batch_size=
+        test_set = dataset.map(tokenize_function, batched=True, batch_size=5)
         test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
         return 'TRAINING DONE'
 
@@ -154,7 +154,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         third_third = dataset['train'].select(range(third_size*2, train_size))
         dataset['train'] = third_third
         #tokenized_second_half = tokenize_function(third_third)
-        tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=
+        tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=5)
         dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
         dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
         return 'THIRD THIRD LOADED'
@@ -169,7 +169,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         second_third = dataset['train'].select(range(third_size, third_size*2))
         dataset['train'] = second_third
         del dataset['test']
-        tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=
+        tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=5)
         dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
         dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
         dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
@@ -186,7 +186,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         dataset['train'] = first_third
         del dataset['test']
         del dataset['validation']
-        tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=
+        tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=5)
 
         tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
         print('DONE')