Reyad-Ahmmed committed on
Commit 58f3006 · verified · 1 Parent(s): bcf8364
Files changed (1)
  1. app.py +51 -13
app.py CHANGED
@@ -70,14 +70,13 @@ if (should_train_model=='1'): #train model
  #settings
  model_save_path = path_to_save_trained_model_to
  bias_non_fleet = 1.0
- epochs_to_run = 25
+ epochs_to_run = 15

  file_path_train = train_file + ".csv"
  file_path_test = test_file + ".csv"

  # Read the CSV files into pandas DataFrames; they will later be converted to Datasets and used to train and evaluate the model
-
- file_train_df = fetch_and_update_training_data(file_path_train)
+ file_train_df = pd.read_csv(file_path_train)
  file_test_df = pd.read_csv(file_path_test)


@@ -93,10 +92,9 @@ if (should_train_model=='1'): #train model

  repo_name = "Reyad-Ahmmed/hf-data-timeframe"

+ # Tokenization - get the tokenizer (it must match the model checkpoint loaded below)
+ # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
  tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
-
- #tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-
  # I made sure to add all the tags that appear in the training and eval data to this list
  # since we are training using data that only contains the left tag - we don't need right tags added to this list
  new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>', '<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>']
@@ -104,9 +102,9 @@ if (should_train_model=='1'): #train model


  # Model
- model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-
- #model = BertForSequenceClassification.from_pretrained("roberta-base", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
+ model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+ # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+

  # Resize the model's token embeddings to match the tokenizer's vocabulary after adding the new tags
  model.resize_token_embeddings(len(tokenizer))
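Note: the updated line moves the model to 'cuda' unconditionally, so the training branch would fail on a CPU-only machine. A minimal sketch of a guarded device choice, reusing the torch.cuda.is_available() check that already appears in the commented-out class-weight lines later in this commit (the device variable is illustrative, not part of the commit):

    import torch
    from transformers import BertForSequenceClassification

    # Hypothetical fallback: use the GPU when present, otherwise stay on the CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BertForSequenceClassification.from_pretrained(
        repo_name,  # "Reyad-Ahmmed/hf-data-timeframe", as defined earlier in app.py
        subfolder="bert_embeddings_finetune",
        output_attentions=True,
        num_labels=len(label_mapping),  # label_mapping is defined earlier in app.py
        output_hidden_states=True,
    ).to(device)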
@@ -146,6 +144,8 @@ if (should_train_model=='1'): #train model
  emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
  emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

+
+
  # Step 4: Split dataset into train and validation
  # Create top-level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
  # and one for "validation" with the test dataset)
@@ -154,10 +154,12 @@ if (should_train_model=='1'): #train model
  'validation': emotions_dataset_test
  })

+
  # Define the tokenize function
  def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True)

+
  # Apply tokenization by mapping the entire dataset (both training and validation) to the tokenize function
  # this will add the "input_ids" and "attention_mask" columns
  emotions_encoded = emotions_encoded.map(tokenize, batched=True)
@@ -179,6 +181,7 @@ if (should_train_model=='1'): #train model
  accuracy = (preds == labels).astype(float).mean()
  return {"accuracy": accuracy}

+
  training_args = TrainingArguments(
  output_dir='./results',
  num_train_epochs=epochs_to_run,
@@ -192,6 +195,10 @@ if (should_train_model=='1'): #train model
  evaluation_strategy="epoch",
  )

+ # notice the bias_non_fleet in the next line (it is given a value at the top of the code)
+ # class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0]) # Replace with your actual class weights
+ # class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
+
  # This is needed b/c loss_fn is swapped out in order to use weighted loss
  # Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one) sensitive to the given label
  class CustomTrainer(Trainer):
@@ -207,6 +214,15 @@ if (should_train_model=='1'): #train model

  return (loss, outputs) if return_outputs else loss

+
+ # trainer = CustomTrainer(
+ # model=model,
+ # compute_metrics=compute_metrics,
+ # args=training_args,
+ # train_dataset=emotions_encoded["train"],
+ # eval_dataset=emotions_encoded["validation"],
+ # tokenizer=tokenizer )
+
  trainer = Trainer(
  model=model,
  args=training_args,
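The body of CustomTrainer.compute_loss sits outside this diff; only its final return statement appears above. A rough sketch of what a weighted-loss override of this kind usually looks like, assuming the class_weights tensor from the commented-out lines earlier in this commit (an illustration, not the author's exact implementation):

    import torch
    from torch import nn
    from transformers import Trainer

    class CustomTrainer(Trainer):
        # Sketch: swap the default loss for a class-weighted cross-entropy
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            # class_weights is assumed to be defined (and moved to the right device) as in the comments above
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss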
@@ -215,6 +231,14 @@ if (should_train_model=='1'): #train model
  tokenizer=tokenizer
  )

+ # Train the model and start a timer to measure the training time
+ start_time = time.time()
+ trainer.train()
+ end_time = time.time()
+ execution_time = end_time - start_time
+
+ print(f"Execution Time: {execution_time:.2f} seconds")
+
  # send validation prompts through the model - will be used in the error-analysis matrix below
  preds_output = trainer.predict(emotions_encoded["validation"])

@@ -280,7 +304,7 @@ if (should_train_model=='1'): #train model

  # Save the model and tokenizer
  model.save_pretrained(f"./{model_save_path}")
- tokenizer.save_pretrained(f"./{model_save_path}")
+ tokenizer.save_pretrained('./saved_fleet_tokenizer')

  # for pushing to the repository
  repo_name = "Reyad-Ahmmed/hf-data-timeframe"
@@ -296,15 +320,25 @@ if (should_train_model=='1'): #train model
  create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

  # Upload the model and tokenizer to the Hugging Face repository
+
  upload_folder(
  folder_path=f"{model_save_path}",
  path_in_repo=f"{model_save_path}",
  repo_id=repo_name,
  token=api_token,
- commit_message="Push model and tokenizer",
+ commit_message="Push fleet model",
+ #overwrite=True # Force overwrite existing files
  )

- print("Operation complete for fine-tunning.")
+ upload_folder(
+ folder_path="saved_fleet_tokenizer",
+ path_in_repo="saved_fleet_tokenizer",
+ repo_id=repo_name,
+ token=api_token,
+ commit_message="Push fleet tokenizer",
+ #overwrite=True # Force overwrite existing files
+ )
+
  else:
  print('Load Pre-trained')
  model_save_path = f"./{model_save_path}"
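Because the commit now pushes the model folder and the tokenizer folder (saved_fleet_tokenizer) as separate subfolders of the same repo, pulling them back down from the Hub would use matching subfolder arguments, roughly as in the following sketch (folder names taken from this diff; this is not code from the commit):

    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
    # model_save_path is the same folder name used in the upload_folder call above
    model = AutoModelForSequenceClassification.from_pretrained(repo_name, subfolder=model_save_path)
    tokenizer = AutoTokenizer.from_pretrained(repo_name, subfolder="saved_fleet_tokenizer")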
@@ -314,6 +348,10 @@ else:
  model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

+ # Define the label mappings (this must match the mapping used during training)
+ label_mapping = model.config.label_mapping
+ label_mapping_reverse = {value: key for key, value in label_mapping.items()}
+
  # Function to classify user input
  def classify_user_input(user_input):
  while True:
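The load branch now expects label_mapping to live on the model config, which only works if the training branch attaches the mapping to the config before save_pretrained (custom attributes on a transformers config are serialized into config.json and restored on load). A hedged sketch of that training-side step (the mapping values are placeholders; the real mapping is defined elsewhere in app.py):

    # Training side: persist the mapping so model.config.label_mapping exists after loading
    label_mapping = {"fleet": 0, "non_fleet": 1}  # placeholder values for illustration only
    model.config.label_mapping = label_mapping
    model.save_pretrained(f"./{model_save_path}")  # the attribute is written into config.json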
@@ -366,4 +404,4 @@ def classify_user_input(user_input):


  iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
- iface.launch(share=True)
+ iface.launch(share=True)