#python hf-fine-tune-fleet-8.py 1 train_fleet test_fleet 1 1 saved_fleet_model
"""Fine-tune (or load) a BERT intent classifier and serve it through Gradio.

Settings are read from ``config.json`` (keys ``arg1`` .. ``arg7``):

* ``arg1`` - "1" to train a model, anything else to load a saved one
* ``arg2`` - training CSV file name, without the ``.csv`` extension
* ``arg3`` - evaluation CSV file name, without the ``.csv`` extension
* ``arg4`` - per-device batch size handed to the Trainer
* ``arg5`` - 1 to plot the confusion matrix after training
* ``arg6`` - directory the trained model/tokenizer is saved to / loaded from
* ``arg7`` - read but not used by this script

When training, the script fine-tunes the model, optionally plots a normalized
confusion matrix, saves the model locally and uploads it to the Hugging Face
Hub. In both modes it finishes by launching a Gradio text interface.
"""

import json
import os
import pprint
import re
import sys
import time

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import torch
import torch.nn.functional as F
from flask import Flask, jsonify, request
from huggingface_hub import HfApi, login, upload_folder, create_repo
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
# NOTE: imported after torch's Dataset on purpose - the script relies on
# `Dataset` being the Hugging Face `datasets.Dataset` (Dataset.from_dict).
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    BertTokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

from fetch_data import fetch_and_update_training_data

# Load configuration file
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

num_args = len(config)

arg1 = config.get('arg1', '1')
arg2 = config.get('arg2', 'train_fleet')
arg3 = config.get('arg3', 'test_fleet')
arg4 = config.get('arg4', '1')
arg5 = config.get('arg5', '1')
arg6 = config.get('arg6', 'saved_fleet_model')
arg7 = config.get('arg7', 'Model')

if num_args == 7:
    # str() so that both "1" and 1 in config.json select the training path
    should_train_model = str(arg1)           # should train model?
    train_file = arg2                        # training file name
    test_file = arg3                         # eval file name
    batch_size_for_trainer = int(arg4)       # batch sizes to send to trainer
    should_produce_eval_matrix = int(arg5)   # should produce matrix?
    path_to_save_trained_model_to = arg6
else:
    print(f"Only {num_args-1} arguments after filename were passed out of 6")
    sys.exit()

# Only use one GPU (in case very weak ones are installed which would slow the
# training down).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

if should_train_model == '1':  # train model
    # settings
    model_save_path = path_to_save_trained_model_to
    bias_non_fleet = 1.0
    epochs_to_run = 15

    file_path_train = train_file + ".csv"
    file_path_test = test_file + ".csv"

    # Read the CSV files into pandas DataFrames; they are later converted to
    # datasets and used to train and evaluate the model.
    file_train_df = fetch_and_update_training_data(file_path_train)
    file_test_df = pd.read_csv(file_path_test)

    # Combine the dataframes to get all possible labels (intents) used for
    # both training and evaluating.
    df = pd.concat([file_train_df, file_test_df], ignore_index=True)

    sorted_labels = sorted(df['label'].unique())

    # create labels map from unique sorted labels
    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
    print("label mappings")
    print(label_mapping)

    repo_name = "Reyad-Ahmmed/hf-data-timeframe"

    # Tokenization - the tokenizer must match the model (both are loaded from
    # the same BERT checkpoint in the repository).
    # Local alternative: BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
    tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")

    # All tags that occur in the training and eval data are meant to be added
    # here. Since the training data only contains the left tag, right tags
    # do not need to be added.
    # NOTE(review): the entries below look like tag strings that were stripped
    # somewhere upstream (they are empty) - confirm against the original list.
    new_tokens = ['', '','', '', '', '', '', ']']
    tokenizer.add_tokens(new_tokens)

    # Model (moved to the detected device instead of assuming CUDA exists)
    # Local alternative: BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', ...)
    model = BertForSequenceClassification.from_pretrained(
        repo_name,
        subfolder="bert_embeddings_finetune",
        output_attentions=True,
        num_labels=len(label_mapping),
        output_hidden_states=True,
    ).to(device)

    # Resize the embedding matrix to the tokenizer's size after adding the
    # tags to the tokenizer's tokens.
    model.resize_token_embeddings(len(tokenizer))

    # Step 2: Convert string labels to integers.
    # Same mapping as label_mapping (sorted unique labels -> index).
    label_to_id = dict(label_mapping)
    print(label_to_id)

    # Dataframes contain prompts and label names
    print('before converting labels to labelIds')
    pprint.pp(file_train_df)
    pprint.pp(file_test_df)

    # Swap out the label names with label ids in both dataframes
    file_train_df["label"] = file_train_df["label"].map(label_to_id)
    file_test_df["label"] = file_test_df["label"].map(label_to_id)

    print('after swapping out label names with Ids')
    pprint.pp(file_train_df)
    pprint.pp(file_test_df)

    # Step 3: Convert both dataframes to dictionaries
    emotions_dict_train = {"text": file_train_df["text"].tolist(), "label": file_train_df["label"].tolist()}
    emotions_dict_test = {"text": file_test_df["text"].tolist(), "label": file_test_df["label"].tolist()}

    print('dictionaries')
    pprint.pp(emotions_dict_train)
    pprint.pp(emotions_dict_test)

    # convert dictionaries to datasets
    emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
    emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

    # Step 4: Top level dictionary with both datasets: "train" holds the
    # training dataset and "validation" holds the test dataset.
    emotions_encoded = DatasetDict({
        'train': emotions_dataset_train,
        'validation': emotions_dataset_test
    })

    def tokenize(batch):
        """Tokenize the "text" column of a batch with padding and truncation."""
        return tokenizer(batch["text"], padding=True, truncation=True)

    # Map the entire dataset (training and validation) through the tokenizer;
    # this adds the "input_ids" and "attention_mask" columns.
    emotions_encoded = emotions_encoded.map(tokenize, batched=True)
    emotions_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Set the model to evaluation mode (this line does not run any training
    # or eval).
    model.eval()
    model.to(device)

    def compute_metrics(pred):
        """Return {"accuracy": ...} for a Trainer prediction output.

        Used as part of error analysis; when the predictions are a tuple the
        logits are taken from its first element.
        """
        logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
        preds = logits.argmax(-1)
        labels = pred.label_ids
        accuracy = (preds == labels).astype(float).mean()
        return {"accuracy": accuracy}

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs_to_run,
        per_device_train_batch_size=batch_size_for_trainer,
        per_device_eval_batch_size=batch_size_for_trainer,
        warmup_steps=500,
        learning_rate=2e-5,
        weight_decay=0.02,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
    )

    # Optional weighted loss - notice bias_non_fleet (set at the top of this
    # branch). Class weights not equal to one make the model more (> 1) or
    # less (< 1) sensitive to the given label:
    # class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0])
    # class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')

    class CustomTrainer(Trainer):
        """Trainer whose loss function can be swapped for a weighted loss."""

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            """Compute cross-entropy loss on the model's logits.

            ``num_items_in_batch`` is accepted (and ignored) so the override
            stays callable by Trainer versions that pass it.
            """
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")

            # To use class weights: torch.nn.CrossEntropyLoss(weight=class_weights)
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return (loss, outputs) if return_outputs else loss

    # To use the custom loss, build the trainer as
    # CustomTrainer(model=model, compute_metrics=compute_metrics, args=training_args, ...)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=emotions_encoded["train"],
        eval_dataset=emotions_encoded["validation"],
        tokenizer=tokenizer
    )

    # Train the model and measure the training time
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution Time: {execution_time:.2f} seconds")

    # Send the validation prompts through the model - used by the
    # error-analysis matrix below.
    preds_output = trainer.predict(emotions_encoded["validation"])

    ################# This section creates an error analysis matrix

    # Extract the logits from the predictions output
    logits = preds_output.predictions[0] if isinstance(preds_output.predictions, tuple) else preds_output.predictions

    # Get the predicted class by applying argmax on the logits
    y_preds = np.argmax(logits, axis=1)  # prediction
    y_valid = np.array(emotions_encoded["validation"]["label"])  # labels

    print("Ypreds and valids shape")
    print(y_preds.shape, y_valid.shape)

    def plot_confusion_matrix_with_text_labels(y_preds, y_true, labels):
        """Plot, save (confusion_matrix.png) and show a row-normalized confusion matrix."""
        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_preds, normalize="true")

        # Plot confusion matrix; the figure grows with the number of labels
        fig, ax = plt.subplots(figsize=(len(labels), len(labels)))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)

        # Rotate the x-axis labels to prevent overlap
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

        plt.title("Normalized Confusion Matrix with Text Labels")
        plt.tight_layout()
        plt.savefig("confusion_matrix.png")
        plt.show()

    # Labels that occur in the validation data or the predictions - these are
    # the ones shown in the matrix.
    unique_labels = sorted(set(y_valid) | set(y_preds))
    id_to_label = {v: k for k, v in label_to_id.items()}
    labels = [id_to_label[label] for label in unique_labels]

    print ("unique_labels")
    print(labels)

    # Call the function with the correct labels
    if should_produce_eval_matrix == 1:
        plot_confusion_matrix_with_text_labels(y_preds, y_valid, labels)

    # The label mapping is saved in the model config so that any other
    # program using the model (e.g. the inference-only path below) can
    # retrieve it.
    model.config.label_mapping = label_mapping

    # Save the model and tokenizer
    model.save_pretrained(f"./{model_save_path}")
    tokenizer.save_pretrained(f"./{model_save_path}")

    # Push to the repository. The token is read from the `hf_token`
    # environment variable, falling back to `HF_API_TOKEN`.
    api_token = os.getenv("hf_token") or os.getenv("HF_API_TOKEN")

    if not api_token:
        raise ValueError(
            "API token not found. Please set the hf_token (or HF_API_TOKEN) environment variable."
        )

    # Create repository (if not already created)
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

    # Upload the model and tokenizer to the Hugging Face repository
    upload_folder(
        folder_path=f"{model_save_path}",
        path_in_repo=f"{model_save_path}",
        repo_id=repo_name,
        token=api_token,
        commit_message="Push fleet model",
    )

else:
    print('Load Pre-trained')
    # Build the path from the configured directory (model_save_path is not
    # defined yet on this path).
    model_save_path = f"./{path_to_save_trained_model_to}"
    tokenizer_save_path = model_save_path
    model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# Label mappings as stored in the model config (must match the mapping used
# during training). Set after both branches so inference works either way.
label_mapping = model.config.label_mapping
# int() so the predicted class index always matches a key
label_mapping_reverse = {int(value): key for key, value in label_mapping.items()}

# Make sure the model is in inference mode before serving predictions
model.eval()


def classify_user_input(user_input):
    """Classify one prompt and return the predicted intent with confidences.

    The input is tokenized, moved to the model's device and run through the
    model once. Diagnostics are printed to stdout and the same report is
    returned as text for the Gradio output box.
    """
    # Keep the inputs on the same device as the model (CPU or GPU)
    input_encoding = tokenizer(
        user_input, padding=True, truncation=True, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        # Debug aid: show how the prompt was split into tokens
        for token_id in input_encoding['input_ids'][0]:
            print(tokenizer.decode([int(token_id)]))

        output = model(**input_encoding, output_hidden_states=True)

    # Apply softmax to get the probabilities (confidence scores)
    probabilities = F.softmax(output.logits, dim=-1)

    prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

    # Map prediction back to label
    print(prediction)
    predicted_label = label_mapping_reverse[int(prediction[0])]
    print(f"Predicted intent: {predicted_label}\n")

    report_lines = [f"Predicted intent: {predicted_label}", "", "Label Confidence Scores:"]

    # Print the confidence for each label
    print("\nLabel Confidence Scores:")
    for i, label in label_mapping_reverse.items():
        confidence = probabilities[0][i].item()  # confidence score for this label
        print(f"{label}: {confidence:.4f}")
        report_lines.append(f"{label}: {confidence:.4f}")
    print("\n")

    return "\n".join(report_lines)


iface = gr.Interface(fn=classify_user_input, inputs="text", outputs="text")
iface.launch(share=True)