Kevin Fink committed on
Commit 440639a · 1 Parent(s): 05f8623
Files changed (1)
  1. app.py +4 -4
app.py CHANGED
@@ -139,7 +139,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     dataset = load_dataset(dataset_name.strip())
     del dataset['train']
     del dataset['validation']
-    test_set = dataset.map(tokenize_function, batched=True, batch_size=5)
+    test_set = dataset.map(tokenize_function, batched=True, batch_size=50)
     test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
     return 'TRAINING DONE'
 
@@ -154,7 +154,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     third_third = dataset['train'].select(range(third_size*2, train_size))
     dataset['train'] = third_third
     #tokenized_second_half = tokenize_function(third_third)
-    tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=5)
+    tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50)
     dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
     dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
     return 'THIRD THIRD LOADED'
@@ -167,7 +167,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     second_third = dataset['train'].select(range(third_size, third_size*2))
     dataset['train'] = second_third
     del dataset['test']
-    tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=5)
+    tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50)
     dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
     dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
     dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
@@ -183,7 +183,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     dataset['train'] = first_third
     del dataset['test']
     del dataset['validation']
-    tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=5)
+    tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50)
 
     tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
     print('DONE')
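The only functional change in this commit is raising batch_size from 5 to 50 in each of the four Dataset.map calls. With batched=True, the datasets library hands tokenize_function a dict of columns holding up to batch_size examples per call, so the larger batch means roughly ten times fewer Python-level calls into the tokenizer. A minimal sketch of the pattern, assuming a hypothetical tokenizer, dataset, and 'text' column, none of which appear in this diff:

    from datasets import load_dataset
    from transformers import AutoTokenizer

    # Hypothetical choices; app.py's actual tokenizer and dataset are not shown here.
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    dataset = load_dataset('imdb')

    def tokenize_function(batch):
        # With batched=True, `batch` is a dict of columns, each holding
        # up to batch_size (here 50) examples per call.
        return tokenizer(batch['text'], truncation=True, max_length=512)

    # One call to tokenize_function now covers 50 examples instead of 5.
    tokenized = dataset.map(tokenize_function, batched=True, batch_size=50)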
 
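For context, the surrounding app.py code splits the train split into thirds with select, tokenizes one third per run, and stitches each new shard onto the previously saved one with concatenate_datasets before writing it back out. A sketch of that resumable chunked-tokenization pattern, reusing the hypothetical tokenize_function above, with illustrative paths and dataset:

    from datasets import load_dataset, load_from_disk, concatenate_datasets
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # hypothetical

    def tokenize_function(batch):
        return tokenizer(batch['text'], truncation=True, max_length=512)

    dataset = load_dataset('imdb')  # illustrative dataset
    train_size = len(dataset['train'])
    third_size = train_size // 3

    # First run: tokenize the first third and save it to disk.
    first = dataset['train'].select(range(third_size))
    tokenized_first = first.map(tokenize_function, batched=True, batch_size=50)
    tokenized_first.save_to_disk('/data/example_train_dataset')  # illustrative path

    # A later run: tokenize the second third, then append it to the saved shard.
    saved = load_from_disk('/data/example_train_dataset')
    second = dataset['train'].select(range(third_size, third_size * 2))
    tokenized_second = second.map(tokenize_function, batched=True, batch_size=50)
    combined = concatenate_datasets([saved, tokenized_second])
    combined.save_to_disk('/data/example_train_dataset2')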