Kevin Fink committed on
Commit
4dafb88
·
1 Parent(s): ab2f056
Files changed (1) hide show
  1. app.py +7 -3
app.py CHANGED
@@ -117,7 +117,8 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
117
  try:
118
  tokenized_first_half = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
119
  second_half = dataset['train'].select(range(half_size, train_size))
120
- tokenized_second_half = tokenize_function(second_half.to_dict())
 
121
  tokenized_train_dataset = concatenate_datasets([tokenized_first_half, tokenized_second_half])
122
  tokenized_test_dataset = tokenize_function(dataset['test'])
123
 
@@ -133,6 +134,10 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
133
  tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
134
  # Tokenize the dataset
135
  first_half = dataset['train'].select(range(half_size))
 
 
 
 
136
  tokenized_half = tokenize_function(first_half.to_dict())
137
 
138
  tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
@@ -211,5 +216,4 @@ try:
211
  # Launch the interface
212
  iface.launch()
213
  except Exception as e:
214
- print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")
215
-
 
117
  try:
118
  tokenized_first_half = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
119
  second_half = dataset['train'].select(range(half_size, train_size))
120
+ dataset['train'] = second_half
121
+ tokenized_second_half = dataset.map(tokenize_function, batched=True)
122
  tokenized_train_dataset = concatenate_datasets([tokenized_first_half, tokenized_second_half])
123
  tokenized_test_dataset = tokenize_function(dataset['test'])
124
 
 
134
  tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
135
  # Tokenize the dataset
136
  first_half = dataset['train'].select(range(half_size))
137
+ dataset['train'] = first_half
138
+ del dataset['test']
139
+ del dataset['validation']
140
+ tokenized_second_half = dataset.map(tokenize_function, batched=True)
141
  tokenized_half = tokenize_function(first_half.to_dict())
142
 
143
  tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
 
216
  # Launch the interface
217
  iface.launch()
218
  except Exception as e:
219
+ print(f"An error occurred: {str(e)}, TB: {traceback.format_exc()}")