Reyad-Ahmmed committed on
Commit 58f3006 · verified · 1 Parent(s): bcf8364
Files changed (1)
  1. app.py +51 -13
app.py CHANGED
@@ -70,14 +70,13 @@ if (should_train_model=='1'): #train model
  #settings
  model_save_path = path_to_save_trained_model_to
  bias_non_fleet = 1.0
- epochs_to_run = 25
+ epochs_to_run = 15

  file_path_train = train_file + ".csv"
  file_path_test = test_file + ".csv"

  # Read the CSV files into pandas DataFrames; they will later be converted to Datasets and used to train and evaluate the model
-
- file_train_df = fetch_and_update_training_data(file_path_train)
+ file_train_df = pd.read_csv(file_path_train)
  file_test_df = pd.read_csv(file_path_test)


@@ -93,10 +92,9 @@ if (should_train_model=='1'): #train model

  repo_name = "Reyad-Ahmmed/hf-data-timeframe"

+ # Tokenization - get the tokenizer (it must match the model checkpoint loaded below)
+ # tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
  tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")
-
- #tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-
  # I made sure to add all the tags that appear in the training and eval data to this list
  # since we are training using data that only contains the left tag - we don't need right tags added to this list
  new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>', '<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>']
@@ -104,9 +102,9 @@ if (should_train_model=='1'): #train model


  # Model
- model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
-
- #model = BertForSequenceClassification.from_pretrained("roberta-base", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cpu')
+ model = BertForSequenceClassification.from_pretrained(repo_name, subfolder="bert_embeddings_finetune", output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+ # model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')
+

  # Resize the model's token embeddings to match the tokenizer's vocabulary after adding the new tags
  model.resize_token_embeddings(len(tokenizer))
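Note: the updated line moves the model to 'cuda' unconditionally, so the training branch would fail on a CPU-only machine. A minimal sketch of a guarded device choice, reusing the torch.cuda.is_available() check that already appears in the commented-out class-weight lines later in this commit (the device variable is illustrative, not part of the commit):

    import torch
    from transformers import BertForSequenceClassification

    # Hypothetical fallback: use the GPU when present, otherwise stay on the CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = BertForSequenceClassification.from_pretrained(
        repo_name,  # "Reyad-Ahmmed/hf-data-timeframe", as defined earlier in app.py
        subfolder="bert_embeddings_finetune",
        output_attentions=True,
        num_labels=len(label_mapping),  # label_mapping is defined earlier in app.py
        output_hidden_states=True,
    ).to(device)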
@@ -146,6 +144,8 @@ if (should_train_model=='1'): #train model
  emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
  emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

+
+
  # Step 4: Split dataset into train and validation
  # Create top-level dictionary with both datasets (will contain two keys: one for "train" whose value is the training dataset
  # and one for "validation" with the test dataset)
@@ -154,10 +154,12 @@ if (should_train_model=='1'): #train model
  'validation': emotions_dataset_test
  })

+
  # Define the tokenize function
  def tokenize(batch):
  return tokenizer(batch["text"], padding=True, truncation=True)

+
  # Apply tokenization by mapping the entire dataset (both training and validation) to the tokenize function
  # this will add the "input_ids" and "attention_mask" columns
  emotions_encoded = emotions_encoded.map(tokenize, batched=True)
@@ -179,6 +181,7 @@ if (should_train_model=='1'): #train model
  accuracy = (preds == labels).astype(float).mean()
  return {"accuracy": accuracy}

+
  training_args = TrainingArguments(
  output_dir='./results',
  num_train_epochs=epochs_to_run,
@@ -192,6 +195,10 @@ if (should_train_model=='1'): #train model
  evaluation_strategy="epoch",
  )

+ # notice the bias_non_fleet in the next line (it is given a value at the top of the code)
+ # class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0]) # Replace with your actual class weights
+ # class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
+
  # This is needed b/c loss_fn is swapped out in order to use weighted loss
  # Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one) sensitive to the given label
  class CustomTrainer(Trainer):
@@ -207,6 +214,15 @@ if (should_train_model=='1'): #train model

  return (loss, outputs) if return_outputs else loss

+
+ # trainer = CustomTrainer(
+ # model=model,
+ # compute_metrics=compute_metrics,
+ # args=training_args,
+ # train_dataset=emotions_encoded["train"],
+ # eval_dataset=emotions_encoded["validation"],
+ # tokenizer=tokenizer )
+
  trainer = Trainer(
  model=model,
  args=training_args,
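The body of CustomTrainer.compute_loss sits outside this diff; only its final return statement appears above. A rough sketch of what a weighted-loss override of this kind usually looks like, assuming the class_weights tensor from the commented-out lines earlier in this commit (an illustration, not the author's exact implementation):

    import torch
    from torch import nn
    from transformers import Trainer

    class CustomTrainer(Trainer):
        # Sketch: swap the default loss for a class-weighted cross-entropy
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs.logits
            # class_weights is assumed to be defined (and moved to the right device) as in the comments above
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss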
@@ -215,6 +231,14 @@ if (should_train_model=='1'): #train model
  tokenizer=tokenizer
  )

+ # Train the model and start a timer to measure the training time
+ start_time = time.time()
+ trainer.train()
+ end_time = time.time()
+ execution_time = end_time - start_time
+
+ print(f"Execution Time: {execution_time:.2f} seconds")
+
  # send validation prompts through the model - will be used in the error-analysis matrix below
  preds_output = trainer.predict(emotions_encoded["validation"])

@@ -280,7 +304,7 @@ if (should_train_model=='1'): #train model

  # Save the model and tokenizer
  model.save_pretrained(f"./{model_save_path}")
- tokenizer.save_pretrained(f"./{model_save_path}")
+ tokenizer.save_pretrained('./saved_fleet_tokenizer')

  # for pushing to the repository
  repo_name = "Reyad-Ahmmed/hf-data-timeframe"
@@ -296,15 +320,25 @@ if (should_train_model=='1'): #train model
  create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

  # Upload the model and tokenizer to the Hugging Face repository
+
  upload_folder(
  folder_path=f"{model_save_path}",
  path_in_repo=f"{model_save_path}",
  repo_id=repo_name,
  token=api_token,
- commit_message="Push model and tokenizer",
+ commit_message="Push fleet model",
+ #overwrite=True # Force overwrite existing files
  )

- print("Operation complete for fine-tunning.")
+ upload_folder(
+ folder_path="saved_fleet_tokenizer",
+ path_in_repo="saved_fleet_tokenizer",
+ repo_id=repo_name,
+ token=api_token,
+ commit_message="Push fleet tokenizer",
+ #overwrite=True # Force overwrite existing files
+ )
+
  else:
  print('Load Pre-trained')
  model_save_path = f"./{model_save_path}"
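Because the commit now pushes the model folder and the tokenizer folder (saved_fleet_tokenizer) as separate subfolders of the same repo, pulling them back down from the Hub would use matching subfolder arguments, roughly as in the following sketch (folder names taken from this diff; this is not code from the commit):

    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
    # model_save_path is the same folder name used in the upload_folder call above
    model = AutoModelForSequenceClassification.from_pretrained(repo_name, subfolder=model_save_path)
    tokenizer = AutoTokenizer.from_pretrained(repo_name, subfolder="saved_fleet_tokenizer")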
@@ -314,6 +348,10 @@ else:
  model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

+ # Define the label mappings (this must match the mapping used during training)
+ label_mapping = model.config.label_mapping
+ label_mapping_reverse = {value: key for key, value in label_mapping.items()}
+
  # Function to classify user input
  def classify_user_input(user_input):
  while True:
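The load branch now expects label_mapping to live on the model config, which only works if the training branch attaches the mapping to the config before save_pretrained (custom attributes on a transformers config are serialized into config.json and restored on load). A hedged sketch of that training-side step (the mapping values are placeholders; the real mapping is defined elsewhere in app.py):

    # Training side: persist the mapping so model.config.label_mapping exists after loading
    label_mapping = {"fleet": 0, "non_fleet": 1}  # placeholder values for illustration only
    model.config.label_mapping = label_mapping
    model.save_pretrained(f"./{model_save_path}")  # the attribute is written into config.json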
@@ -366,4 +404,4 @@ def classify_user_input(user_input):


  iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
- iface.launch(share=True)
+ iface.launch(share=True)