Kevin Fink
committed on
Commit
·
db67174
1
Parent(s):
da5b30a
dev
Browse files
app.py
CHANGED
@@ -145,9 +145,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
145 |
dataset['test'] = dataset['test'].select(range(50))
|
146 |
del dataset['train']
|
147 |
del dataset['validation']
|
148 |
-
test_set = tokenize_function
|
149 |
-
|
150 |
-
dataset['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
|
151 |
return 'TRAINING DONE'
|
152 |
|
153 |
elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
|
@@ -162,8 +161,9 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
162 |
saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
163 |
third_third = dataset['train'].select(range(third_size*2, train_size))
|
164 |
dataset['train'] = third_third
|
165 |
-
|
166 |
-
|
|
|
167 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
|
168 |
return 'THIRD THIRD LOADED'
|
169 |
|
@@ -179,13 +179,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
179 |
second_third = dataset['train'].select(range(third_size, third_size*2))
|
180 |
dataset['train'] = second_third
|
181 |
del dataset['test']
|
182 |
-
|
183 |
-
|
184 |
-
dataset['validation'] = validation_set
|
185 |
-
dataset['train'] = concatenate_datasets([saved_dataset['train'], train_set_2])
|
186 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
187 |
dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
|
188 |
-
|
189 |
return 'SECOND THIRD LOADED'
|
190 |
|
191 |
except Exception as e:
|
@@ -200,9 +197,9 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
200 |
dataset['train'] = first_third
|
201 |
del dataset['test']
|
202 |
del dataset['validation']
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
print('DONE')
|
207 |
return 'RUN AGAIN TO LOAD REST OF DATA'
|
208 |
|
|
|
145 |
dataset['test'] = dataset['test'].select(range(50))
|
146 |
del dataset['train']
|
147 |
del dataset['validation']
|
148 |
+
test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
|
149 |
+
test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
|
|
|
150 |
return 'TRAINING DONE'
|
151 |
|
152 |
elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
|
|
|
161 |
saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
162 |
third_third = dataset['train'].select(range(third_size*2, train_size))
|
163 |
dataset['train'] = third_third
|
164 |
+
#tokenized_second_half = tokenize_function(third_third)
|
165 |
+
tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50,remove_columns=column_names,)
|
166 |
+
dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
|
167 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
|
168 |
return 'THIRD THIRD LOADED'
|
169 |
|
|
|
179 |
second_third = dataset['train'].select(range(third_size, third_size*2))
|
180 |
dataset['train'] = second_third
|
181 |
del dataset['test']
|
182 |
+
tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
|
183 |
+
dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
|
|
|
|
|
184 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
185 |
dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
|
|
|
186 |
return 'SECOND THIRD LOADED'
|
187 |
|
188 |
except Exception as e:
|
|
|
197 |
dataset['train'] = first_third
|
198 |
del dataset['test']
|
199 |
del dataset['validation']
|
200 |
+
tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
|
201 |
+
|
202 |
+
tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
|
203 |
print('DONE')
|
204 |
return 'RUN AGAIN TO LOAD REST OF DATA'
|
205 |
|