Kevin Fink committed on
Commit 3b756d7 · 1 Parent(s): 069a9a6
Files changed (1)
  1. app.py +10 -10
app.py CHANGED
@@ -112,22 +112,22 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     # Load the dataset
     dataset = load_dataset(dataset_name.strip())
     train_size = len(dataset['train'])
-    half_size = train_size // 2
+    third_size = train_size // 3
     max_length = model.get_input_embeddings().weight.shape[0]
     try:
-        tokenized_first_half = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
-        if 'test' in tokenized_first_half.keys():
-            second_half_second_quarter = dataset['train'].select(range(half_size+half_size//2, train_size))
-            dataset['train'] = second_half_second_quarter
+        saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
+        if 'test' in saved_dataset.keys():
+            third_third = dataset['train'].select(range(third_size*2, train_size))
+            dataset['train'] = third_third
             tokenized_second_half = dataset.map(tokenize_function, batched=True)
-            dataset['train'] = concatenate_datasets([tokenized_first_half['train'], tokenized_second_half['train']])
+            dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_second_half['train']])
             tokenized_train_dataset = dataset['train']
             tokenized_test_dataset = dataset['test']
         else:
-            second_half_first_quarter = dataset['train'].select(range(half_size, half_size+half_size//2))
-            dataset['train'] = second_half_first_quarter
+            second_third = dataset['train'].select(range(third_size, third_size*2))
+            dataset['train'] = second_third
             tokenized_sh_fq_dataset = tokenize_function(dataset, batched=True)
-            dataset['train'] = concatenate_datasets([tokenized_first_half['train'], tokenized_sh_fq_dataset['train']])
+            dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
             tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
             return

@@ -142,7 +142,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     except:
         tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
         # Tokenize the dataset
-        first_half = dataset['train'].select(range(half_size))
+        first_half = dataset['train'].select(range(third_size))
         dataset['train'] = first_half
         del dataset['test']
         del dataset['validation']
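
The commit switches the incremental tokenization from a half/quarter split to thirds: each run of fine_tune_model tokenizes one third of the train split, appends it to the partial result saved under /data/{hub_id}_train_dataset, and returns early until all three passes are done. Below is a minimal, self-contained sketch of that resumable pattern using the Hugging Face datasets API. The function name, the FileNotFoundError handling, and the row-count-based progress tracking are illustrative assumptions, not the exact logic in app.py (the commit itself keys the branch on whether a 'test' split is present in the saved dataset).

# Sketch only: one third of the train split is tokenized per call, accumulated on disk.
from datasets import load_dataset, load_from_disk, concatenate_datasets

def tokenize_one_third(dataset_name: str, hub_id: str, tokenize_function, data_dir: str = "/data"):
    save_path = f"{data_dir}/{hub_id.strip()}_train_dataset"
    train = load_dataset(dataset_name.strip())["train"]
    third = len(train) // 3

    try:
        done = load_from_disk(save_path)   # tokenized rows from earlier runs
        start = len(done)                  # resume where the previous run stopped
    except FileNotFoundError:
        done, start = None, 0

    # The last pass runs to the end so the remainder of an uneven split is not dropped.
    end = len(train) if start >= 2 * third else start + third
    chunk = train.select(range(start, end)).map(tokenize_function, batched=True)
    merged = chunk if done is None else concatenate_datasets([done, chunk])

    if end < len(train):
        merged.save_to_disk(save_path)     # partial result; a later run picks it up
        return None                        # stop early, mirroring the commit's `return`
    return merged                          # all three thirds tokenized

The design point of the change is simply that a Space with limited runtime can finish tokenizing a large dataset across three restarts instead of two, as long as each pass persists its output with save_to_disk before exiting.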