Kevin Fink
committed on
Commit · 3b756d7
1 Parent(s): 069a9a6
dev
app.py CHANGED
@@ -112,22 +112,22 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     # Load the dataset
     dataset = load_dataset(dataset_name.strip())
     train_size = len(dataset['train'])
-
+    third_size = train_size // 3
     max_length = model.get_input_embeddings().weight.shape[0]
     try:
-
-        if 'test' in
-
-        dataset['train'] =
+        saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
+        if 'test' in saved_dataset.keys():
+            third_third = dataset['train'].select(range(third_size*2, train_size))
+            dataset['train'] = third_third
             tokenized_second_half = dataset.map(tokenize_function, batched=True)
-            dataset['train'] = concatenate_datasets([
+            dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_second_half['train']])
             tokenized_train_dataset = dataset['train']
             tokenized_test_dataset = dataset['test']
         else:
-
-            dataset['train'] =
+            second_third = dataset['train'].select(range(third_size, third_size*2))
+            dataset['train'] = second_third
             tokenized_sh_fq_dataset = tokenize_function(dataset, batched=True)
-            dataset['train'] = concatenate_datasets([
+            dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
             tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
             return

@@ -142,7 +142,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     except:
         tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
         # Tokenize the dataset
-        first_half = dataset['train'].select(range(
+        first_half = dataset['train'].select(range(third_size))
         dataset['train'] = first_half
         del dataset['test']
         del dataset['validation']