Kevin Fink committed on
Commit
69cfd5f
·
1 Parent(s): 93bdf45
Files changed (1) hide show
  1. app.py +1 -2
app.py CHANGED
@@ -111,7 +111,6 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
111
  #max_length = 512
112
  # Load the dataset
113
  dataset = load_dataset(dataset_name.strip())
114
- print(dataset.keys())
115
  train_size = len(dataset['train'])
116
  third_size = train_size // 3
117
  max_length = model.get_input_embeddings().weight.shape[0]
@@ -139,7 +138,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
139
  second_third = dataset['train'].select(range(third_size, third_size*2))
140
  dataset['train'] = second_third
141
  del dataset['test']
142
- tokenized_sh_fq_dataset = tokenize_function(dataset, batched=True)
143
  dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
144
  dataset.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
145
  return
 
111
  #max_length = 512
112
  # Load the dataset
113
  dataset = load_dataset(dataset_name.strip())
 
114
  train_size = len(dataset['train'])
115
  third_size = train_size // 3
116
  max_length = model.get_input_embeddings().weight.shape[0]
 
138
  second_third = dataset['train'].select(range(third_size, third_size*2))
139
  dataset['train'] = second_third
140
  del dataset['test']
141
+ tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True)
142
  dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
143
  dataset.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
144
  return