Kevin Fink committed on
Commit baa8d35 · 1 Parent(s): fafbcd2
Files changed (1)
  1. app.py +22 -12
app.py CHANGED
@@ -117,18 +117,28 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
     try:
         load_from_disk(f'/data/{hub_id.strip()}_validation_dataset')
-        try:
-            saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
-            print("FOUND TEST")
+        try:
             train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset3')
-            # Create Trainer
-            trainer = Trainer(
-                model=model,
-                args=training_args,
-                train_dataset=train_dataset,
-                eval_dataset=saved_test_dataset,
-                compute_metrics=compute_metrics,
-            )
+            try:
+
+                saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
+                print("FOUND TEST")
+                # Create Trainer
+                trainer = Trainer(
+                    model=model,
+                    args=training_args,
+                    train_dataset=train_dataset,
+                    eval_dataset=saved_test_dataset,
+                    compute_metrics=compute_metrics,
+                )
+            except:
+                if len(dataset['train']) == len(train_dataset['train']):
+                    dataset = load_dataset(dataset_name.strip())
+                    del dataset['train']
+                    del dataset['validation']
+                    test_set = dataset.map(tokenize_function, batched=True)
+                    test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
+                return 'TRAINING DONE'
         except:
             dataset = load_dataset(dataset_name.strip())
             train_size = len(dataset['train'])
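Note on this hunk: the new inner try/except defers test-set tokenization. Once _train_dataset3 covers the full training split, the fallback branch tokenizes only the test split and caches it under /data, so a restarted run can pick it up with load_from_disk instead of re-tokenizing. Below is a minimal sketch of that cache-or-rebuild step; the tokenizer checkpoint, the 'text' column, and the function names are placeholders, not taken from the Space, while the /data paths and datasets calls mirror the diff.

# Sketch of the commit's cache-or-rebuild pattern for the test split (assumed names).
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')  # placeholder checkpoint

def tokenize_function(examples):
    # Stand-in for the Space's real tokenize_function.
    return tokenizer(examples['text'], truncation=True, padding='max_length')

def get_test_dataset(dataset_name, hub_id):
    path = f'/data/{hub_id.strip()}_test_dataset'
    try:
        # A previous run already tokenized and saved the test split.
        return load_from_disk(path)
    except FileNotFoundError:
        dataset = load_dataset(dataset_name.strip())
        # Drop the splits map() should not touch, as the commit does.
        del dataset['train']
        del dataset['validation']
        test_set = dataset.map(tokenize_function, batched=True)
        test_set['test'].save_to_disk(path)
        return test_set['test']

Catching FileNotFoundError rather than using a bare except:, as in this sketch, keeps real tokenization errors visible; the commit's bare except: branches work as control flow, but they also swallow unrelated failures.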
 
@@ -137,10 +147,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
             third_third = dataset['train'].select(range(third_size*2, train_size))
             dataset['train'] = third_third
+            del dataset['test']
             tokenized_second_half = dataset.map(tokenize_function, batched=True)
             dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_second_half['train']])
             dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
-            dataset['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
             return 'THIRD THIRD LOADED'
     except:
         dataset = load_dataset(dataset_name.strip())
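Note on this hunk: the commit stops saving the raw test split while assembling the final training chunk and instead deletes it before map(), so only the remaining splits are tokenized here; the test split is now handled by the deferred step in the first hunk. For context, the surrounding lines implement a tokenize-in-thirds resume scheme. A rough self-contained sketch of its final stage follows; the function name and framing are assumptions for illustration, and it assumes the earlier stage saved a DatasetDict, as the Space's use of saved_dataset['train'] implies.

# Sketch of the final stage of the tokenize-in-thirds resume scheme (assumed framing).
from datasets import load_dataset, load_from_disk, concatenate_datasets

def tokenize_final_third(dataset_name, hub_id, tokenize_function):
    dataset = load_dataset(dataset_name.strip())
    train_size = len(dataset['train'])
    third_size = train_size // 3
    # Earlier runs tokenized the first two thirds and saved them as *_train_dataset2.
    saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset2')
    dataset['train'] = dataset['train'].select(range(third_size * 2, train_size))
    del dataset['test']  # tokenized separately; see the first hunk
    tokenized = dataset.map(tokenize_function, batched=True)
    full_train = concatenate_datasets([saved_dataset['train'], tokenized['train']])
    full_train.save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
    return 'THIRD THIRD LOADED'

Checkpointing each stage under /data, the persistent-storage mount on Spaces, is what lets a restarted run resume from the last completed third instead of re-tokenizing the whole training split.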