Kevin Fink
committed on
Commit
·
8849792
1
Parent(s):
ae2e833
dev
Browse files
app.py
CHANGED
@@ -142,13 +142,15 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
142 |
except:
|
143 |
dataset = load_dataset(dataset_name.strip())
|
144 |
train_size = len(dataset['train'])
|
145 |
-
third_size = train_size // 3
|
|
|
|
|
146 |
print("FOUND VALIDATION")
|
147 |
saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
148 |
third_third = dataset['train'].select(range(third_size*2, train_size))
|
149 |
dataset['train'] = third_third
|
150 |
-
|
151 |
-
|
152 |
tokenized_second_half = dataset.map(tokenize_function, batched=True)
|
153 |
dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_second_half['train']])
|
154 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
|
|
|
142 |
except:
|
143 |
dataset = load_dataset(dataset_name.strip())
|
144 |
train_size = len(dataset['train'])
|
145 |
+
third_size = train_size // 3
|
146 |
+
del dataset['test']
|
147 |
+
del dataset['validation']
|
148 |
print("FOUND VALIDATION")
|
149 |
saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
|
150 |
third_third = dataset['train'].select(range(third_size*2, train_size))
|
151 |
dataset['train'] = third_third
|
152 |
+
print(dataset)
|
153 |
+
print(dataset.keys())
|
154 |
tokenized_second_half = dataset.map(tokenize_function, batched=True)
|
155 |
dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_second_half['train']])
|
156 |
dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
|