Kevin Fink committed on
Commit
b6a7390
·
1 Parent(s): 7f8fbf8
Files changed (1) hide show
  1. app.py +16 -10
app.py CHANGED
@@ -103,7 +103,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
103
  # Setup the decoder input IDs (shifted right)
104
  labels = tokenizer(
105
  examples['target'],
106
- max_length=max_length, # Set to None for dynamic padding
107
  truncation=True,
108
  padding='max_length',
109
  #text_target=examples['target'],
@@ -140,7 +140,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
140
 
141
  elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
142
  dataset = load_dataset(dataset_name.strip())
143
- dataset['test'] = dataset['test'].select(range(700))
 
144
  del dataset['train']
145
  del dataset['validation']
146
  test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
@@ -150,6 +151,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
150
  elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
151
  dataset = load_dataset(dataset_name.strip())
152
  dataset['train'] = dataset['train'].select(range(8000))
 
153
  train_size = len(dataset['train'])
154
  third_size = train_size // 3
155
  del dataset['test']
@@ -167,8 +169,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
167
 
168
  if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
169
  dataset = load_dataset(dataset_name.strip())
170
- dataset['train'] = dataset['train'].select(range(8000))
171
- dataset['validation'] = dataset['validation'].select(range(300))
 
 
172
  train_size = len(dataset['train'])
173
  third_size = train_size // 3
174
  second_third = dataset['train'].select(range(third_size, third_size*2))
@@ -183,7 +187,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
183
  except Exception as e:
184
  print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
185
  dataset = load_dataset(dataset_name.strip())
186
- dataset['train'] = dataset['train'].select(range(8000))
 
187
  train_size = len(dataset['train'])
188
  third_size = train_size // 3
189
  # Tokenize the dataset
@@ -198,11 +203,12 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
198
  return 'RUN AGAIN TO LOAD REST OF DATA'
199
 
200
  # Fine-tune the model
201
- if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
202
- train_result = trainer.train(resume_from_checkpoint=True)
203
- else:
204
- train_result = trainer.train()
205
- trainer.push_to_hub(commit_message="Training complete!")
 
206
  except Exception as e:
207
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
208
  return 'DONE!'#train_result
 
103
  # Setup the decoder input IDs (shifted right)
104
  labels = tokenizer(
105
  examples['target'],
106
+ max_length=128, # Set to None for dynamic padding
107
  truncation=True,
108
  padding='max_length',
109
  #text_target=examples['target'],
 
140
 
141
  elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
142
  dataset = load_dataset(dataset_name.strip())
143
+ #dataset['test'] = dataset['test'].select(range(700))
144
+ dataset['test'] = dataset['test'].select(range(50))
145
  del dataset['train']
146
  del dataset['validation']
147
  test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
 
151
  elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
152
  dataset = load_dataset(dataset_name.strip())
153
  dataset['train'] = dataset['train'].select(range(8000))
154
+ dataset['train'] = dataset['train'].select(range(1000))
155
  train_size = len(dataset['train'])
156
  third_size = train_size // 3
157
  del dataset['test']
 
169
 
170
  if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
171
  dataset = load_dataset(dataset_name.strip())
172
+ dataset['train'] = dataset['train'].select(range(1000))
173
+ dataset['validation'] = dataset['validation'].select(range(100))
174
+ #dataset['train'] = dataset['train'].select(range(8000))
175
+ #dataset['validation'] = dataset['validation'].select(range(300))
176
  train_size = len(dataset['train'])
177
  third_size = train_size // 3
178
  second_third = dataset['train'].select(range(third_size, third_size*2))
 
187
  except Exception as e:
188
  print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
189
  dataset = load_dataset(dataset_name.strip())
190
+ #dataset['train'] = dataset['train'].select(range(8000))
191
+ dataset['train'] = dataset['train'].select(range(1000))
192
  train_size = len(dataset['train'])
193
  third_size = train_size // 3
194
  # Tokenize the dataset
 
203
  return 'RUN AGAIN TO LOAD REST OF DATA'
204
 
205
  # Fine-tune the model
206
+ trainer.evaluate()
207
+ #if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
208
+ #train_result = trainer.train(resume_from_checkpoint=True)
209
+ #else:
210
+ #train_result = trainer.train()
211
+ #trainer.push_to_hub(commit_message="Training complete!")
212
  except Exception as e:
213
  return f"An error occurred: {str(e)}, TB: {traceback.format_exc()}"
214
  return 'DONE!'#train_result