Kevin Fink committed
Commit 27c01f2 · 1 Parent(s): 57412a6
Files changed (1):
  1. app.py (+4 -4)

app.py CHANGED
@@ -138,7 +138,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     dataset['train'] = dataset['train'].select(range(8000))
     del dataset['train']
     del dataset['validation']
-    test_set = dataset.map(tokenize_function, batched=True, batch_size=20)
+    test_set = dataset.map(tokenize_function, batched=True, batch_size=5)
     test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
     return 'TRAINING DONE'

@@ -154,7 +154,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     third_third = dataset['train'].select(range(third_size*2, train_size))
     dataset['train'] = third_third
     #tokenized_second_half = tokenize_function(third_third)
-    tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=20)
+    tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=5)
     dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
     dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
     return 'THIRD THIRD LOADED'

@@ -169,7 +169,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     second_third = dataset['train'].select(range(third_size, third_size*2))
     dataset['train'] = second_third
     del dataset['test']
-    tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=20)
+    tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=5)
     dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
     dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
     dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')

@@ -186,7 +186,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     dataset['train'] = first_third
     del dataset['test']
     del dataset['validation']
-    tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=20)
+    tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=5)

     tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
     print('DONE')
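
Note on the change: all four dataset.map(...) calls drop batch_size from 20 to 5, which reduces how many examples tokenize_function holds in memory per call. Below is a minimal, self-contained sketch of that pattern outside app.py, assuming a Hugging Face DatasetDict with a 'text' column; the 'imdb' dataset, DistilBERT checkpoint, and output path are illustrative stand-ins, not the values the Space actually uses.

# Sketch of batched tokenization with a small map batch size.
# Assumptions (not from app.py): dataset 'imdb', tokenizer
# 'distilbert-base-uncased', and a 'text' column to tokenize.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    # Truncate/pad to a fixed length so every mapped batch yields
    # uniform-shaped features.
    return tokenizer(examples['text'], truncation=True,
                     padding='max_length', max_length=128)

dataset = load_dataset('imdb')

# batched=True hands `batch_size` examples to tokenize_function per call;
# lowering it from 20 to 5 cuts the peak memory each call touches.
tokenized = dataset.map(tokenize_function, batched=True, batch_size=5)
tokenized['train'].save_to_disk('/data/example_train_dataset')  # illustrative path

The trade-off runs the other way for speed: a smaller batch_size means more Python-level round-trips through tokenize_function, so 5 reads as a memory-pressure workaround rather than a throughput optimization.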