Kevin Fink committed
Commit f4fd08e · 1 Parent(s): 6527df5
Files changed (1)
  1. app.py +39 -46
app.py CHANGED
@@ -83,11 +83,41 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         print("Loading model from checkpoint...")
         model = AutoModelForSeq2SeqLM.from_pretrained(training_args.output_dir)
 
-    max_length = 512
-    #max_length = model.get_input_embeddings().weight.shape[0]
+    def tokenize_function(examples):
+
+        # Assuming 'text' is the input and 'target' is the expected output
+        model_inputs = tokenizer(
+            examples['text'],
+            max_length=max_length, # Set to None for dynamic padding
+            truncation=True,
+            padding='max_length',
+            return_tensors='pt',
+        )
+
+        # Setup the decoder input IDs (shifted right)
+        labels = tokenizer(
+            examples['target'],
+            max_length=max_length, # Set to None for dynamic padding
+            truncation=True,
+            padding='max_length',
+            text_target=examples['target'],
+            return_tensors='pt',
+        )
+
+        # Add labels to the model inputs
+        model_inputs["labels"] = labels["input_ids"]
+        return model_inputs
+
+    #max_length = 512
+    train_size = len(dataset['train'])
+    half_size = train_size // 2
+    max_length = model.get_input_embeddings().weight.shape[0]
     try:
-        tokenized_train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
-        tokenized_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
+        tokenized_first_half = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
+        second_half = dataset['train'].select(range(half_size, train_size))
+        tokenized_second_half = tokenize_function(second_half)
+        tokenized_train_dataset = concatenate_datasets([tokenized_first_half, tokenized_second_half])
+        tokenized_test_dataset = tokenize_function(dataset['test'])
 
         # Create Trainer
         trainer = Trainer(
@@ -99,54 +129,17 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
         )
     except:
         # Load the dataset
-        dataset = load_dataset(dataset_name.strip())
+        dataset = load_dataset(dataset_name.strip())
         tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
         # Tokenize the dataset
-        def tokenize_function(examples):
-
-            # Assuming 'text' is the input and 'target' is the expected output
-            model_inputs = tokenizer(
-                examples['text'],
-                max_length=max_length, # Set to None for dynamic padding
-                truncation=True,
-                padding='max_length',
-                return_tensors='pt',
-            )
-
-            # Setup the decoder input IDs (shifted right)
-            labels = tokenizer(
-                examples['target'],
-                max_length=max_length, # Set to None for dynamic padding
-                truncation=True,
-                padding='max_length',
-                text_target=examples['target'],
-                return_tensors='pt',
-            )
 
-            # Add labels to the model inputs
-            model_inputs["labels"] = labels["input_ids"]
-            return model_inputs
 
-        tokenized_datasets = dataset.map(tokenize_function, batched=True)
+        first_half = dataset['train'].select(range(half_size))
+        tokenized_half = tokenize_function(first_half)
 
-        tokenized_datasets['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
-        tokenized_datasets['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
-
-        embedding_size = model.get_input_embeddings().weight.shape[0]
-
-        if len(tokenizer) > embedding_size:
-            model.resize_token_embeddings(len(tokenizer))
-            model.resize_position_embeddings(len(tokenizer))
+        tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
 
-        # Create Trainer
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_datasets['train'],
-            eval_dataset=tokenized_datasets['test'],
-            compute_metrics=compute_metrics,
-            #callbacks=[LoggingCallback()],
-        )
+        return 'RUN AGAIN TO LOAD REST OF DATA'
 
     # Fine-tune the model
     if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir):
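
Note: the hunks above split tokenization of the training set across two runs. The first run tokenizes and caches half of the data to disk and returns early with 'RUN AGAIN TO LOAD REST OF DATA'; the second run loads the cached half, tokenizes the remainder, and concatenates the two. Below is a minimal self-contained sketch of that pattern against the public datasets/transformers APIs, not the app.py implementation: the function name prepare_train_split, the cache path, and the fixed max_length=512 are illustrative, and it tokenizes with Dataset.map so both halves stay Dataset objects that concatenate_datasets can combine.

# Illustrative sketch only -- not the app.py implementation. Shows the
# tokenize-half, cache, re-run, concatenate pattern; names and paths are hypothetical.
import os
from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer

CACHE_DIR = '/data/example_train_dataset'   # hypothetical cache location
tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
max_length = 512                            # fixed length for the sketch

def tokenize_function(examples):
    # Tokenize inputs and targets; 'text'/'target' column names follow the commit.
    model_inputs = tokenizer(
        examples['text'], max_length=max_length,
        truncation=True, padding='max_length',
    )
    labels = tokenizer(
        text_target=examples['target'], max_length=max_length,
        truncation=True, padding='max_length',
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def prepare_train_split(dataset_name):
    dataset = load_dataset(dataset_name)
    train = dataset['train']
    half = len(train) // 2

    if not os.path.exists(CACHE_DIR):
        # First run: tokenize and cache only the first half, then stop early.
        first = train.select(range(half)).map(tokenize_function, batched=True)
        first.save_to_disk(CACHE_DIR)
        return None   # caller re-runs the job to process the remainder

    # Second run: reload the cached half, tokenize the rest, and combine.
    first = load_from_disk(CACHE_DIR)
    second = train.select(range(half, len(train))).map(tokenize_function, batched=True)
    return concatenate_datasets([first, second])

On the first call this caches half of the split and returns None (mirroring the commit's early return); calling it again returns the fully tokenized training split.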