Kevin Fink committed
Commit b6a7390
1 Parent(s): 7f8fbf8
dev

app.py CHANGED
@@ -103,7 +103,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         # Setup the decoder input IDs (shifted right)
         labels = tokenizer(
             examples['target'],
-            max_length=
+            max_length=128, # Set to None for dynamic padding
             truncation=True,
             padding='max_length',
             #text_target=examples['target'],
@@ -140,7 +140,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
 
         elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
             dataset = load_dataset(dataset_name.strip())
-            dataset['test'] = dataset['test'].select(range(700))
+            #dataset['test'] = dataset['test'].select(range(700))
+            dataset['test'] = dataset['test'].select(range(50))
             del dataset['train']
             del dataset['validation']
             test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
@@ -150,6 +151,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
             dataset = load_dataset(dataset_name.strip())
             dataset['train'] = dataset['train'].select(range(8000))
+            dataset['train'] = dataset['train'].select(range(1000))
             train_size = len(dataset['train'])
             third_size = train_size // 3
             del dataset['test']
@@ -167,8 +169,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
 
         if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
             dataset = load_dataset(dataset_name.strip())
-            dataset['train'] = dataset['train'].select(range(8000))
-            dataset['validation'] = dataset['validation'].select(range(300))
+            dataset['train'] = dataset['train'].select(range(1000))
+            dataset['validation'] = dataset['validation'].select(range(100))
+            #dataset['train'] = dataset['train'].select(range(8000))
+            #dataset['validation'] = dataset['validation'].select(range(300))
             train_size = len(dataset['train'])
             third_size = train_size // 3
             second_third = dataset['train'].select(range(third_size, third_size*2))
@@ -183,7 +187,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         except Exception as e:
             print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
             dataset = load_dataset(dataset_name.strip())
-            dataset['train'] = dataset['train'].select(range(8000))
+            #dataset['train'] = dataset['train'].select(range(8000))
+            dataset['train'] = dataset['train'].select(range(1000))
             train_size = len(dataset['train'])
             third_size = train_size // 3
             # Tokenize the dataset
@@ -198,11 +203,12 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
                 return 'RUN AGAIN TO LOAD REST OF DATA'
 
         # Fine-tune the model
-        if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
-            train_result = trainer.train(resume_from_checkpoint=True)
-        else:
-            train_result = trainer.train()
-        trainer.push_to_hub(commit_message="Training complete!")
+        trainer.evaluate()
+        #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
+            #train_result = trainer.train(resume_from_checkpoint=True)
+        #else:
+            #train_result = trainer.train()
+        #trainer.push_to_hub(commit_message="Training complete!")
     except Exception as e:
         return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
     return 'DONE!'#train_result
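
For readers following the first hunk: the old call passed a truncated max_length= argument, and the new code pins it to 128 tokens with fixed-length padding. Below is a minimal sketch of that tokenizer call in isolation, assuming an arbitrary seq2seq checkpoint (google/flan-t5-small is only a placeholder; the commit does not name the model being fine-tuned).

from transformers import AutoTokenizer

# Placeholder checkpoint; app.py does not reveal which model it fine-tunes.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

examples = {"target": ["a short target", "a somewhat longer target sentence to pad"]}

# Mirrors the edited call: every sequence is truncated or padded to exactly 128 tokens.
labels = tokenizer(
    examples["target"],
    max_length=128,
    truncation=True,
    padding="max_length",
)
print(len(labels["input_ids"][0]), len(labels["input_ids"][1]))  # 128 128

# With max_length=None and padding=False the encodings keep their natural lengths,
# and a collator such as DataCollatorForSeq2Seq pads each batch dynamically instead.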
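
The dataset hunks all follow one pattern: shrink each split with Dataset.select(range(n)) so debug runs finish quickly, then carve the train split into thirds via train_size // 3. A small sketch of that pattern, using a hypothetical dataset name since app.py takes dataset_name as a user input:

from datasets import load_dataset

# Hypothetical dataset; in app.py the name comes from the dataset_name argument.
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Shrink the splits the same way the commit does, so a debug run stays cheap.
dataset["train"] = dataset["train"].select(range(1000))
dataset["validation"] = dataset["validation"].select(range(100))
dataset["test"] = dataset["test"].select(range(50))

# Carve the reduced train split into thirds, mirroring the second_third selection.
train_size = len(dataset["train"])
third_size = train_size // 3
first_third = dataset["train"].select(range(third_size))
second_third = dataset["train"].select(range(third_size, third_size * 2))
final_third = dataset["train"].select(range(third_size * 2, train_size))
print(len(first_third), len(second_third), len(final_third))  # 333 333 334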
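
The last hunk replaces the train/resume/push block with a single trainer.evaluate() call. The sketch below, under the assumption that the app builds a Seq2SeqTrainer from transformers (the checkpoint and toy data are stand-ins), shows an evaluation-only pass and keeps the previous resume-from-checkpoint logic as comments for comparison.

import os  # needed only if the commented resume logic below is re-enabled
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Stand-in checkpoint and toy data; the real app tokenizes a Hub dataset.
checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

def tokenize(batch):
    enc = tokenizer(batch["text"], max_length=32, truncation=True, padding="max_length")
    enc["labels"] = tokenizer(batch["target"], max_length=32, truncation=True,
                              padding="max_length")["input_ids"]
    return enc

raw = Dataset.from_dict({"text": ["translate: hello"] * 4, "target": ["bonjour"] * 4})
tokenized = raw.map(tokenize, batched=True, remove_columns=["text", "target"])

training_args = Seq2SeqTrainingArguments(
    output_dir="./tmp_run",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    report_to=[],
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    eval_dataset=tokenized,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

# What the commit switches to: a single evaluation pass, no weight updates.
print(trainer.evaluate())

# The behaviour it replaces (now commented out in app.py): resume training from an
# existing checkpoint if output_dir is non-empty, otherwise train from scratch,
# then push the result to the Hub.
# if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
#     train_result = trainer.train(resume_from_checkpoint=True)
# else:
#     train_result = trainer.train()
# trainer.push_to_hub(commit_message="Training complete!")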