Kevin Fink committed
Commit db67174 · Parent(s): da5b30a
Files changed (1)
  1. app.py +10 -13
app.py CHANGED
@@ -145,9 +145,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         dataset['test'] = dataset['test'].select(range(50))
         del dataset['train']
         del dataset['validation']
-        test_set = tokenize_function(dataset['test'])
-        dataset['test'] = test_set
-        dataset['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
+        test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
+        test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
         return 'TRAINING DONE'

     elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
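For reference, the pattern the new lines adopt is the datasets library's batched Dataset.map with remove_columns. A minimal, self-contained sketch of how such a call behaves; the checkpoint, dataset name, and tokenize_function body here are illustrative stand-ins, since app.py's real definitions are outside this diff:

    from datasets import load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('t5-small')  # placeholder checkpoint

    def tokenize_function(examples):
        # assumed shape; the real tokenize_function in app.py is not shown in this diff
        return tokenizer(examples['text'], truncation=True, max_length=128)

    dataset = load_dataset('imdb')                # placeholder dataset
    column_names = dataset['train'].column_names  # raw columns to drop after tokenizing
    # batched=True passes dicts of up to 50 examples per call; remove_columns drops
    # the original text fields so only the tokenizer's outputs remain in each split
    tokenized = dataset.map(tokenize_function, batched=True, batch_size=50,
                            remove_columns=column_names)
    print(tokenized['test'][0].keys())            # e.g. input_ids, attention_mask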
@@ -162,8 +161,9 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
         third_third = dataset['train'].select(range(third_size*2, train_size))
         dataset['train'] = third_third
-        train_set_3 = tokenize_function(dataset['train'])
-        dataset['train'] = concatenate_datasets([saved_dataset, train_set_3])
+        #tokenized_second_half = tokenize_function(third_third)
+        tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
+        dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
         dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
         return 'THIRD THIRD LOADED'
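The hunk above is one leg of a chunked, resumable tokenization scheme: each pass tokenizes one third of the training split and concatenates it onto the portion saved by the previous pass. A sketch of that accumulate-and-save step, assuming the same /data layout; third_size, train_size, tokenize_function, column_names, and dataset are taken from the surrounding function, and the paths are illustrative:

    from datasets import load_from_disk, concatenate_datasets

    # the two thirds already tokenized and written by the prior pass
    saved = load_from_disk('/data/example_train_dataset2')
    # select the remaining rows, tokenize them, and append to the saved portion
    next_slice = dataset['train'].select(range(third_size * 2, train_size))
    tokenized = next_slice.map(tokenize_function, batched=True, batch_size=50,
                               remove_columns=column_names)
    combined = concatenate_datasets([saved, tokenized])
    combined.save_to_disk('/data/example_train_dataset3')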
 
@@ -179,13 +179,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         second_third = dataset['train'].select(range(third_size, third_size*2))
         dataset['train'] = second_third
         del dataset['test']
-        train_set_2 = tokenize_function(dataset['train'])
-        validation_set = tokenize_function(dataset['validation'])
-        dataset['validation'] = validation_set
-        dataset['train'] = concatenate_datasets([saved_dataset['train'], train_set_2])
+        tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
+        dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
         dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
         dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
-
         return 'SECOND THIRD LOADED'

     except Exception as e:
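Note what makes the single map call above sufficient: after del dataset['test'], DatasetDict.map runs the function over every split still present, so the validation split is tokenized in the same pass as the train slice, and the old per-split tokenize_function calls collapse into one line. A tiny sketch of that per-split behavior (split names and contents are made up):

    from datasets import Dataset, DatasetDict

    dd = DatasetDict({
        'train': Dataset.from_dict({'text': ['a', 'b']}),
        'validation': Dataset.from_dict({'text': ['c']}),
    })
    # map on a DatasetDict applies the function to each remaining split
    upper = dd.map(lambda ex: {'upper': ex['text'].upper()})
    print(upper['train'][0], upper['validation'][0])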
@@ -200,9 +197,9 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         dataset['train'] = first_third
         del dataset['test']
         del dataset['validation']
-        train_set = tokenize_function(dataset['train'])
-        dataset['train'] = train_set
-        dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
+        tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)
+
+        tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
         print('DONE')
         return 'RUN AGAIN TO LOAD REST OF DATA'
 
 
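One wrinkle the three passes rely on: save_to_disk writes whatever container it is called on, and load_from_disk returns the same type. The first pass saves a whole DatasetDict (so the next pass indexes saved_dataset['train']), while the later passes save a bare, already-concatenated Dataset (so saved_dataset is used directly). A minimal round trip showing the difference, with throwaway paths:

    from datasets import Dataset, DatasetDict, load_from_disk

    dd = DatasetDict({'train': Dataset.from_dict({'x': [1, 2]})})
    dd.save_to_disk('/tmp/example_dd')              # DatasetDict in ...
    print(type(load_from_disk('/tmp/example_dd')))  # ... DatasetDict back: index ['train']

    dd['train'].save_to_disk('/tmp/example_ds')     # bare Dataset in ...
    print(type(load_from_disk('/tmp/example_ds')))  # ... Dataset back: use directly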