Update app.py (#1)
Commit a40bfe28586cba79a1d3b0dec7c640aaf023dffa
app.py (CHANGED)

@@ -70,14 +70,13 @@ if (should_train_model=='1'): #train model
     #settings
     model_save_path = path_to_save_trained_model_to
     bias_non_fleet = 1.0
-    epochs_to_run =
+    epochs_to_run = 15

     file_path_train = train_file + ".csv"
     file_path_test = test_file + ".csv"

     # Read the CSV files into pandas DataFrames; they will later be converted to DataTables and used to train and evaluate the model
-
-    file_train_df = fetch_and_update_training_data(file_path_train)
+    file_train_df = pd.read_csv(file_path_train)
     file_test_df = pd.read_csv(file_path_test)


@@ -93,10 +92,9 @@ if (should_train_model=='1'): #train model

     repo_name = "Reyad-Ahmmed/hf-data-timeframe"

+    # Tokenization - get Tokenizer for roberta-base (must match model - also roberta-base)
+    # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
     tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
-
-    #tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-
     # I made sure to add all the ones in the training and eval data to this list
     # since we are training using data that only contains the left tag - we don't need right tags added to this list
     new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>','<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>]']
@@ -104,9 +102,9 @@ if (should_train_model=='1'): #train model


     # Model
-    model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('
-
-
+    model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+    # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+

     # Reset tokenizer size to include the new size after adding the tags to the tokenizer's tokens
     model.resize_token_embeddings(len(tokenizer))
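
Note on the tag tokens: before resize_token_embeddings has any effect, the placeholder tags in new_tokens have to be registered on the tokenizer. That add_tokens call is not visible in the hunks shown, so the snippet below is only a minimal sketch of the usual pattern, not the app's exact code; the checkpoint name and token list are placeholders. (The trailing "]" inside the last new_tokens entry, '<TRUCK_STATUS_NAME>]', also looks unintentional.)

from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # placeholder checkpoint
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Register the placeholder tags so they stay single tokens instead of being split into word pieces
new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<TRUCK_NAME>', '<POINT_NAME>']
num_added = tokenizer.add_tokens(new_tokens)

# Grow the embedding matrix to cover the enlarged vocabulary (same call as in the diff)
model.resize_token_embeddings(len(tokenizer))
print(f"added {num_added} tokens, vocab size is now {len(tokenizer)}")
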
@@ -146,6 +144,8 @@ if (should_train_model=='1'): #train model
     emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
     emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

+
+
     # Step 4: Split dataset into train and validation
     # Create top level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
     # and one for "validation" with test dataset)
@@ -154,10 +154,12 @@ if (should_train_model=='1'): #train model
         'validation': emotions_dataset_test
     })

+
     # Define the tokenize function
     def tokenize(batch):
         return tokenizer(batch["text"], padding=True, truncation=True)

+
     # Apply tokenization by mapping the entire dataset (both training and validation) to tokenizer function
     # this will add the "input_id" and "attention_mask" columns
     emotions_encoded = emotions_encoded.map(tokenize, batched=True)
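
For context, the DatasetDict + batched map pattern in this hunk looks like the following when written out end to end; the texts, labels, and checkpoint below are stand-ins, not the app's data.

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")   # placeholder checkpoint

train_ds = Dataset.from_dict({"text": ["truck 12 is idle", "show trips for last week"], "label": [0, 1]})
test_ds  = Dataset.from_dict({"text": ["where is the truck"], "label": [0]})

encoded = DatasetDict({"train": train_ds, "validation": test_ds})

def tokenize(batch):
    # same shape as the tokenize() in the diff: pad/truncate per batch
    return tokenizer(batch["text"], padding=True, truncation=True)

# batched map adds "input_ids" and "attention_mask" columns to both splits
encoded = encoded.map(tokenize, batched=True)
print(encoded["train"].column_names)
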
@@ -179,6 +181,7 @@ if (should_train_model=='1'): #train model
         accuracy = (preds == labels).astype(float).mean()
         return {"accuracy": accuracy}

+
     training_args = TrainingArguments(
         output_dir='./results',
         num_train_epochs=epochs_to_run,
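
The accuracy lines above sit inside the compute_metrics callback that Trainer calls at each evaluation. A minimal, self-contained version of that callback is sketched below; it assumes plain logits, and handles the case where the model also returns hidden states or attentions (then predictions arrives as a tuple whose first element is the logits).

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids
    logits = predictions[0] if isinstance(predictions, tuple) else predictions
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).astype(float).mean()
    return {"accuracy": accuracy}
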
@@ -192,6 +195,10 @@ if (should_train_model=='1'): #train model
         evaluation_strategy="epoch",
     )

+    # notice the bias_non_fleet in next line (it is given a value at top of code)
+    # class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0]) # Replace with your actual class weights
+    # class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
+
     # This is needed b/c loss_fn is swapped out in order to use weighted loss
     # Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one) sensitive to the given label
     class CustomTrainer(Trainer):
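
The CustomTrainer referenced here swaps the default loss for a class-weighted one, but its body is only partially visible in this diff. Below is a sketch of the standard weighted-loss override; passing class_weights in explicitly is an assumption, and the app may wire it differently.

import torch
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights   # e.g. torch.tensor([...]) with bias_non_fleet in one slot

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # CrossEntropyLoss with per-class weights makes some labels count more or less in the loss
        weight = self.class_weights.to(logits.device) if self.class_weights is not None else None
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
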
@@ -207,6 +214,15 @@ if (should_train_model=='1'): #train model

             return (loss, outputs) if return_outputs else loss

+
+    # trainer = CustomTrainer(
+    #     model=model,
+    #     compute_metrics=compute_metrics,
+    #     args=training_args,
+    #     train_dataset=emotions_encoded["train"],
+    #     eval_dataset=emotions_encoded["validation"],
+    #     tokenizer=tokenizer )
+
     trainer = Trainer(
         model=model,
         args=training_args,
@@ -215,6 +231,14 @@ if (should_train_model=='1'): #train model
         tokenizer=tokenizer
     )

+    # Train the model and set timer to measure the training time
+    start_time = time.time()
+    trainer.train()
+    end_time = time.time()
+    execution_time = end_time - start_time
+
+    print(f"Execution Time: {execution_time:.2f} seconds")
+
     # send validation prompts through the model - will be used in error-analysis matrix below
     preds_output = trainer.predict(emotions_encoded["validation"])

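
preds_output from trainer.predict() is what feeds the error-analysis matrix the comment mentions. A small standalone illustration of turning logits into a confusion matrix is below; the numbers are dummies (in the app the logits would come from preds_output.predictions), and sklearn is an assumed dependency here, not one confirmed by the diff.

import numpy as np
from sklearn.metrics import confusion_matrix

logits = np.array([[2.1, 0.3], [0.2, 1.7], [1.5, 1.6]])   # stand-in for preds_output.predictions
y_true = np.array([0, 1, 0])                              # stand-in for the validation labels

y_pred = np.argmax(logits, axis=-1)
print(confusion_matrix(y_true, y_pred))
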
@@ -280,7 +304,7 @@ if (should_train_model=='1'): #train model

     # Save the model and tokenizer
     model.save_pretrained(f"./{model_save_path}")
-    tokenizer.save_pretrained(
+    tokenizer.save_pretrained('./saved_fleet_tokenizer')

     #for push repository
     repo_name = "Reyad-Ahmmed/hf-data-timeframe"
@@ -296,15 +320,25 @@ if (should_train_model=='1'): #train model
     create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

     # Upload the model and tokenizer to the Hugging Face repository
+
     upload_folder(
         folder_path=f"{model_save_path}",
         path_in_repo=f"{model_save_path}",
         repo_id=repo_name,
         token=api_token,
-        commit_message="Push model
+        commit_message="Push fleet model",
+        #overwrite=True # Force overwrite existing files
     )

-
+    upload_folder(
+        folder_path="saved_fleet_tokenizer",
+        path_in_repo="saved_fleet_tokenizer",
+        repo_id=repo_name,
+        token=api_token,
+        commit_message="Push fleet tokenizer",
+        #overwrite=True # Force overwrite existing files
+    )
+
 else:
     print('Load Pre-trained')
     model_save_path = f"./{model_save_path}"
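
The push above uses create_repo and upload_folder from huggingface_hub; stripped of the app's specifics it reduces to the pattern below. The repo name, token handling, and folder names are placeholders, not the app's values.

from huggingface_hub import create_repo, upload_folder

api_token = "hf_..."                 # normally read from a secret, never hard-coded
repo_name = "your-user/your-repo"    # placeholder repo id

create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

upload_folder(
    folder_path="saved_model_dir",    # local folder holding the save_pretrained(...) output
    path_in_repo="saved_model_dir",   # where that folder lands inside the repo
    repo_id=repo_name,
    token=api_token,
    commit_message="Push model",
)

Uploading the tokenizer with a second upload_folder call, as the diff does, keeps model and tokenizer in separate subfolders of the same repository.
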
@@ -314,6 +348,10 @@ else:
     model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

+    #Define the label mappings (this must match the mapping used during training)
+    label_mapping = model.config.label_mapping
+    label_mapping_reverse = {value: key for key, value in label_mapping.items()}
+
 #Function to classify user input
 def classify_user_input(user_input):
     while True:
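
label_mapping here is read back from model.config: any extra attribute set on the config is written into config.json by save_pretrained and restored by from_pretrained, which is what makes this round trip work. A small sketch of that behavior, with an illustrative mapping and paths:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.config.label_mapping = {"fleet": 0, "timeframe": 1, "other": 2}   # illustrative mapping

model.save_pretrained("./tmp_model_dir")   # label_mapping is serialized into config.json

reloaded = AutoModelForSequenceClassification.from_pretrained("./tmp_model_dir")
label_mapping = reloaded.config.label_mapping
label_mapping_reverse = {value: key for key, value in label_mapping.items()}
print(label_mapping_reverse[1])   # -> "timeframe"
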
@@ -366,4 +404,4 @@ def classify_user_input(user_input):


 iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
-iface.launch(share=True)
+iface.launch(share=True)
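
Finally, the Gradio wiring at the bottom of the file is a plain text-in, text-out interface. A self-contained stub version is below; the real classify_user_input runs the fine-tuned model (and its while True loop reads like a leftover from a console version), and share=True is generally ignored when the app already runs on a Space.

import gradio as gr

def classify_user_input(user_input):
    # Stub: in the app this tokenizes the input, runs the model on CPU,
    # and maps the argmax id back through label_mapping_reverse.
    return f"predicted label for: {user_input}"

iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
iface.launch(share=True)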