Kevin Fink committed on
Commit
9613a2c
·
1 Parent(s): 97a2943
Files changed (1) hide show
  1. app.py +7 -1
app.py CHANGED
@@ -131,6 +131,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
131
 
132
  elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
133
  dataset = load_dataset(dataset_name.strip())
 
 
134
  del dataset['train']
135
  del dataset['validation']
136
  test_set = dataset.map(tokenize_function, batched=True)
@@ -139,6 +141,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
139
 
140
  elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
141
  dataset = load_dataset(dataset_name.strip())
 
142
  train_size = len(dataset['train'])
143
  third_size = train_size // 3
144
  del dataset['test']
@@ -156,6 +159,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
156
 
157
  if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
158
  dataset = load_dataset(dataset_name.strip())
 
 
159
  train_size = len(dataset['train'])
160
  third_size = train_size // 3
161
  second_third = dataset['train'].select(range(third_size, third_size*2))
@@ -170,6 +175,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
170
  except Exception as e:
171
  print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
172
  dataset = load_dataset(dataset_name.strip())
 
173
  train_size = len(dataset['train'])
174
  third_size = train_size // 3
175
  # Tokenize the dataset
@@ -177,7 +183,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
177
  dataset['train'] = first_third
178
  del dataset['test']
179
  del dataset['validation']
180
- tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=80)
181
 
182
  tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
183
  print('DONE')
 
131
 
132
  elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
133
  dataset = load_dataset(dataset_name.strip())
134
+ dataset['test'] = dataset['test'].select(range(1200))
135
+ dataset['train'] = dataset['train'].select(range(12000))
136
  del dataset['train']
137
  del dataset['validation']
138
  test_set = dataset.map(tokenize_function, batched=True)
 
141
 
142
  elif os.access(f'/data/{hub_id.strip()}_validation_dataset', os.R_OK):
143
  dataset = load_dataset(dataset_name.strip())
144
+ dataset['train'] = dataset['train'].select(range(12000))
145
  train_size = len(dataset['train'])
146
  third_size = train_size // 3
147
  del dataset['test']
 
159
 
160
  if os.access(f'/data/{hub_id.strip()}_train_dataset', os.R_OK) and not os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
161
  dataset = load_dataset(dataset_name.strip())
162
+ dataset['train'] = dataset['train'].select(range(12000))
163
+ dataset['validation'] = dataset['validation'].select(range(200))
164
  train_size = len(dataset['train'])
165
  third_size = train_size // 3
166
  second_third = dataset['train'].select(range(third_size, third_size*2))
 
175
  except Exception as e:
176
  print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
177
  dataset = load_dataset(dataset_name.strip())
178
+ dataset['train'] = dataset['train'].select(range(12000))
179
  train_size = len(dataset['train'])
180
  third_size = train_size // 3
181
  # Tokenize the dataset
 
183
  dataset['train'] = first_third
184
  del dataset['test']
185
  del dataset['validation']
186
+ tokenized_first_third = dataset.map(tokenize_function, batched=True)
187
 
188
  tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
189
  print('DONE')