Kevin Fink
committed on
Commit
·
9613a2c
1
Parent(s):
97a2943
dev
Browse files
app.py
CHANGED
@@ -131,6 +131,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
131 |
|
132 |
elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
|
133 |
dataset = load_dataset(dataset_name.strip())
|
|
|
|
|
134 |
del dataset['train']
|
135 |
del dataset['validation']
|
136 |
test_set = dataset.map(tokenize_function, batched=True)
|
@@ -139,6 +141,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
139 |
|
140 |
elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
|
141 |
dataset = load_dataset(dataset_name.strip())
|
|
|
142 |
train_size = len(dataset['train'])
|
143 |
third_size = train_size // 3
|
144 |
del dataset['test']
|
@@ -156,6 +159,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
156 |
|
157 |
if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
|
158 |
dataset = load_dataset(dataset_name.strip())
|
|
|
|
|
159 |
train_size = len(dataset['train'])
|
160 |
third_size = train_size // 3
|
161 |
second_third = dataset['train'].select(range(third_size, third_size*2))
|
@@ -170,6 +175,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
170 |
except Exception as e:
|
171 |
print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
|
172 |
dataset = load_dataset(dataset_name.strip())
|
|
|
173 |
train_size = len(dataset['train'])
|
174 |
third_size = train_size // 3
|
175 |
# Tokenize the dataset
|
@@ -177,7 +183,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
|
|
177 |
dataset['train'] = first_third
|
178 |
del dataset['test']
|
179 |
del dataset['validation']
|
180 |
-
tokenized_first_third = dataset.map(tokenize_function, batched=True
|
181 |
|
182 |
tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
|
183 |
print('DONE')
|
|
|
131 |
|
132 |
elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
|
133 |
dataset = load_dataset(dataset_name.strip())
|
134 |
+
dataset['test'] = dataset['test'].select(range(1200))
|
135 |
+
dataset['train'] = dataset['train'].select(range(12000))
|
136 |
del dataset['train']
|
137 |
del dataset['validation']
|
138 |
test_set = dataset.map(tokenize_function, batched=True)
|
|
|
141 |
|
142 |
elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
|
143 |
dataset = load_dataset(dataset_name.strip())
|
144 |
+
dataset['train'] = dataset['train'].select(range(12000))
|
145 |
train_size = len(dataset['train'])
|
146 |
third_size = train_size // 3
|
147 |
del dataset['test']
|
|
|
159 |
|
160 |
if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
|
161 |
dataset = load_dataset(dataset_name.strip())
|
162 |
+
dataset['train'] = dataset['train'].select(range(12000))
|
163 |
+
dataset['validation'] = dataset['validation'].select(range(200))
|
164 |
train_size = len(dataset['train'])
|
165 |
third_size = train_size // 3
|
166 |
second_third = dataset['train'].select(range(third_size, third_size*2))
|
|
|
175 |
except Exception as e:
|
176 |
print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
|
177 |
dataset = load_dataset(dataset_name.strip())
|
178 |
+
dataset['train'] = dataset['train'].select(range(12000))
|
179 |
train_size = len(dataset['train'])
|
180 |
third_size = train_size // 3
|
181 |
# Tokenize the dataset
|
|
|
183 |
dataset['train'] = first_third
|
184 |
del dataset['test']
|
185 |
del dataset['validation']
|
186 |
+
tokenized_first_third = dataset.map(tokenize_function, batched=True)
|
187 |
|
188 |
tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
|
189 |
print('DONE')
|