HF_Python / app.py
Reyad-Ahmmed's picture
Update app.py
a117686 verified
#python hf-fine-tune-fleet-8.py 1 train_fleet test_fleet 1 1 saved_fleet_model
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import re
from datasets import load_dataset, DatasetDict
import time
import pprint
import json
from huggingface_hub import HfApi, login, upload_folder, create_repo
import os
from flask import Flask, jsonify, request
import requests
from fetch_data import fetch_and_update_training_data
import gradio as gr
# Load the run configuration. config.json replaces the six positional
# command-line arguments the script used to take (plus a seventh entry, arg7),
# so exactly seven entries are required for the run to proceed.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

num_args = len(config)
arg1 = config.get('arg1', '1')
arg2 = config.get('arg2', 'train_fleet')
arg3 = config.get('arg3', 'test_fleet')
arg4 = config.get('arg4', '1')
arg5 = config.get('arg5', '1')
arg6 = config.get('arg6', 'saved_fleet_model')
arg7 = config.get('arg7', 'Model')

if num_args == 7:
    # JSON may carry the flag as the number 1 instead of the string '1';
    # normalise to str so the later `== '1'` comparison accepts both.
    should_train_model = str(arg1)                # should train model?
    train_file = arg2                             # training file name (without .csv)
    test_file = arg3                              # eval file name (without .csv)
    batch_size_for_trainer = int(arg4)            # batch size handed to the trainer
    should_produce_eval_matrix = int(arg5)        # should produce confusion matrix?
    path_to_save_trained_model_to = arg6
else:
    # Report what was actually found and exit with a failure status so a
    # calling process can tell the run did not happen.
    print(f"config.json must contain exactly 7 entries (arg1..arg7); found {num_args}")
    sys.exit(1)
import os

# Expose only GPU 0 to CUDA, so a weaker secondary card cannot slow training down.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if should_train_model == '1':  # training path
    # Run settings
    model_save_path = path_to_save_trained_model_to
    bias_non_fleet = 1.0  # only referenced by the commented-out class-weights code further down
    epochs_to_run = 15
    file_path_train = train_file + ".csv"
    file_path_test = test_file + ".csv"

    # Training rows come from the project helper (presumably a DataFrame with
    # "text" and "label" columns, given how it is used below - confirm in
    # fetch_data); evaluation rows are read straight from the CSV file.
    file_train_df = fetch_and_update_training_data(file_path_train)
    file_test_df = pd.read_csv(file_path_test)

    # Pool both frames so the label set covers every intent that appears in
    # either the training or the evaluation data.
    df = pd.concat([file_train_df, file_test_df], ignore_index=True)
    sorted_labels = sorted(df['label'].unique())

    # Label name -> integer id, assigned in sorted order.
    label_mapping = dict(zip(sorted_labels, range(len(sorted_labels))))
    print("label mappings")
    print(label_mapping)
repo_name = "Reyad-Ahmmed/hf-data-timeframe"

# Tokenizer and model are both BERT checkpoints loaded from the same Hub
# subfolder, so their vocabularies start out matched.
# tokenizer = BertTokenizer.from_pretrained('./mitra_ai_fleet_bert_tokenizer')
tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")

# Placeholder tags that occur in the training and eval prompts. The data only
# contains the left-hand tag, so no closing tags are registered.
# NOTE(review): the last entry ends in a stray ']' ('<TRUCK_STATUS_NAME>]'),
# which looks like a typo; left unchanged because altering it changes the
# vocabulary - confirm against the data before fixing.
new_tokens = ['<EMPLOYEE_FIRST_NAME>', '<EMPLOYEE_LAST_NAME>','<POINT_ADDRESS>', '<TRUCK_NAME>', '<POINT_CLASS_NAME>', '<POINT_NAME>', '<TRUCK_CLASS_NAME>', '<TRUCK_STATUS_NAME>]']
tokenizer.add_tokens(new_tokens)

# Model: one output per known label. It is moved to the device selected
# earlier in the script instead of a hard-coded 'cuda', so the training path
# also starts on a machine without a GPU.
model = BertForSequenceClassification.from_pretrained(
    repo_name,
    subfolder="bert_embeddings_finetune",
    output_attentions=True,
    num_labels=len(label_mapping),
    output_hidden_states=True,
).to(device)
# model = BertForSequenceClassification.from_pretrained('./mitra_ai_fleet_bert', output_attentions=True, num_labels=len(label_mapping), output_hidden_states=True).to('cuda')

# Grow the embedding matrix to cover the tags just added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))
#important_tokens = ["Acura-New", "TR-9012", "TR-NEW-02"]
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
# Step 2: turn the string labels into integer ids.
# Ids are assigned in sorted order over every label seen in either frame.
label_to_id = {name: index for index, name in enumerate(sorted(df["label"].unique()))}
print(label_to_id)

# At this point the frames still hold prompts together with label names.
print('before converting labels to labelIds')
pprint.pp(file_train_df)
pprint.pp(file_test_df)

# Replace every label name with its id, in place, in both frames.
file_train_df["label"] = file_train_df["label"].map(label_to_id)
file_test_df["label"] = file_test_df["label"].map(label_to_id)

print('after swapping out label names with Ids')
pprint.pp(file_train_df)
pprint.pp(file_test_df)

# Step 3: flatten each frame into a plain {"text": [...], "label": [...]} dict.
emotions_dict_train = {
    "text": file_train_df["text"].tolist(),
    "label": file_train_df["label"].tolist(),
}
emotions_dict_test = {
    "text": file_test_df["text"].tolist(),
    "label": file_test_df["label"].tolist(),
}

print('dictionaries')
pprint.pp(emotions_dict_train)
pprint.pp(emotions_dict_test)

# Wrap the dicts as datasets (Dataset is the `datasets` class re-imported just
# above, which shadows the torch Dataset imported at the top of the file).
emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

# Step 4: bundle the two datasets under the split names used from here on:
# "train" for the training data and "validation" for the test data.
emotions_encoded = DatasetDict(
    {
        'train': emotions_dataset_train,
        'validation': emotions_dataset_test,
    }
)
# Tokenization callback handed to DatasetDict.map
def tokenize(batch):
    """Run the module-level tokenizer over a batch's "text" entries.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)
# Tokenize both splits in batches; this adds the "input_ids" and
# "attention_mask" columns to each split.
emotions_encoded = emotions_encoded.map(tokenize, batched=True)

# Hand back torch tensors, restricted to the three columns listed here.
emotions_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Re-select the device and place the model on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Switch the model to evaluation mode (this does not run any training or eval).
model.eval()
from sklearn.metrics import accuracy_score, f1_score
# Extra metric hook for error analysis: yields an "accuracy" value that another
# program can use when hunting for training prompts with large losses.
def compute_metrics(pred):
    """Return {"accuracy": fraction of rows whose arg-max class equals the label}.

    `pred.predictions` may be the logits array itself or a tuple whose first
    element is the logits; `pred.label_ids` holds the reference class ids.
    """
    raw = pred.predictions
    logits = raw[0] if isinstance(raw, tuple) else raw
    predicted_ids = logits.argmax(-1)
    hits = predicted_ids == pred.label_ids
    return {"accuracy": hits.astype(float).mean()}
# Trainer hyper-parameters, grouped by concern.
training_args = TrainingArguments(
    # output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule: evaluate once per epoch
    num_train_epochs=epochs_to_run,
    warmup_steps=500,
    evaluation_strategy="epoch",
    # the same batch size is used for training and evaluation
    per_device_train_batch_size=batch_size_for_trainer,
    per_device_eval_batch_size=batch_size_for_trainer,
    # optimiser
    learning_rate=2e-5,
    weight_decay=0.02,
)
# Class-weighted loss support (currently disabled). bias_non_fleet, set near the
# top of the training path, was the weight given to one label:
# class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0]) # Replace with your actual class weights
# class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')
# A weight above one makes the model more sensitive to that label, a weight
# below one less sensitive.
class CustomTrainer(Trainer):
    """Trainer whose loss is computed here so a weighted loss can be swapped in."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute cross-entropy between the model's logits and the batch labels.

        Args:
            model: the model being trained.
            inputs: the batch mapping; its "labels" entry is the target.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted (and ignored) because recent
                transformers releases pass it as a keyword to this hook;
                without it those versions raise TypeError.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if `return_outputs` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Plain cross-entropy for now; to weight classes use
        # loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss
# Alternative kept for reference - the weighted-loss trainer with the accuracy metric:
# trainer = CustomTrainer(
#     model=model,
#     compute_metrics=compute_metrics,
#     args=training_args,
#     train_dataset=emotions_encoded["train"],
#     eval_dataset=emotions_encoded["validation"],
#     tokenizer=tokenizer )

# The stock Trainer is what actually runs.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
)

# Train, timing the whole run.
start_time = time.time()
trainer.train()
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time:.2f} seconds")

# Push the validation prompts through the trained model; the result feeds the
# error-analysis matrix built below.
preds_output = trainer.predict(emotions_encoded["validation"])

################# Error-analysis matrix inputs
# The predictions are either the logits array or a tuple that starts with it.
if isinstance(preds_output.predictions, tuple):
    logits = preds_output.predictions[0]
else:
    logits = preds_output.predictions

y_preds = logits.argmax(axis=1)                               # predicted class ids
y_valid = np.array(emotions_encoded["validation"]["label"])   # reference class ids
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
print("Ypreds and valids shape")
print(y_preds.shape, y_valid.shape)


# Confusion-matrix plot used by the error analysis
def plot_confusion_matrix_with_text_labels(y_preds, y_true, labels):
    """Draw a row-normalised confusion matrix with text tick labels.

    The figure is sized from the number of labels, written to
    "confusion_matrix.png" in the working directory, and then shown.
    """
    # normalize="true" scales each true-label row
    cm = confusion_matrix(y_true, y_preds, normalize="true")

    side = len(labels)
    fig, ax = plt.subplots(figsize=(side, side))
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels).plot(
        cmap="Blues",
        values_format=".2f",
        ax=ax,
        colorbar=False,
    )

    # Slant the x tick labels so that long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    plt.title("Normalized Confusion Matrix with Text Labels")
    plt.tight_layout()
    plt.savefig("confusion_matrix.png")
    plt.show()
# Collect every class id that occurs either as a reference label or as a
# prediction; these are the rows and columns shown in the matrix.
unique_labels = sorted(set(y_valid).union(y_preds))

# Translate the ids back to their label names for display.
id_to_label = {index: name for name, index in label_to_id.items()}
labels = [id_to_label[class_id] for class_id in unique_labels]
print ("unique_labels")
print(labels)

# Only draw the matrix when the configuration asked for it.
if should_produce_eval_matrix == 1:
    plot_confusion_matrix_with_text_labels(y_preds, y_valid, labels)
# Store the label mapping in the model config so any program that loads the
# model (the inference-only path of this script, or the tool that measures
# poor accuracies) can recover it.
model.config.label_mapping = label_mapping

# Save the model and tokenizer side by side in the same folder.
model.save_pretrained(f"./{model_save_path}")
tokenizer.save_pretrained(f"./{model_save_path}")

# Target repository for the push.
repo_name = "Reyad-Ahmmed/hf-data-timeframe"

# The token is read from the `hf_token` environment variable; the error text
# names that same variable so the message tells the user what to set.
api_token = os.getenv("hf_token")
if not api_token:
    raise ValueError("API token not found. Please set the hf_token environment variable.")

# Create the repository if it does not exist yet (exist_ok avoids an error when it does).
api = HfApi()
create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

# Upload the saved folder to the same relative path inside the repository.
upload_folder(
    folder_path=f"{model_save_path}",
    path_in_repo=f"{model_save_path}",
    repo_id=repo_name,
    token=api_token,
    commit_message="Push fleet model",
    #overwrite=True # Force overwrite existing files
)
else:
print('Load Pre-trained')
model_save_path = f"./{model_save_path}"
tokenizer_save_path = f"./{model_save_path}"
# RobertaTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
# Recover the label mapping stored in the model config (it must match the one
# used during training) and invert it to go from class id to label name.
label_mapping = model.config.label_mapping
label_mapping_reverse = {
    class_id: label_name for label_name, class_id in label_mapping.items()
}
# Function to classify user input (used as the Gradio callback)
def classify_user_input(user_input):
    """Classify one prompt and report the predicted intent with confidences.

    Uses the module-level `tokenizer`, `model` and `label_mapping_reverse`.
    The previous version wrapped the body in `while True` without returning,
    so a call never finished and the UI got no output; it also sent the inputs
    to 'cuda' regardless of where the model was loaded.

    Args:
        user_input: the text to classify.

    Returns:
        str: the predicted intent followed by one "label: confidence" line per
        known label, or a short prompt when the input is empty.
    """
    if not user_input or not user_input.strip():
        return "Please enter some text to classify."

    # Put the inputs on the device the model's weights are on, so the same
    # code works for the CPU-loaded and the GPU-trained model.
    model_device = next(model.parameters()).device
    input_encoding = tokenizer(
        user_input, padding=True, truncation=True, return_tensors="pt"
    ).to(model_device)

    # Debug aid: echo each token the model will see.
    for token_id in input_encoding['input_ids'][0]:
        print(tokenizer.decode([token_id]))

    with torch.no_grad():
        output = model(**input_encoding, output_hidden_states=True)

    # Softmax turns the logits into per-label confidence scores.
    probabilities = F.softmax(output.logits, dim=-1)
    prediction = torch.argmax(output.logits, dim=1).cpu().numpy()
    print(prediction)

    # Map the predicted class id back to its label name.
    predicted_label = label_mapping_reverse[int(prediction[0])]

    report_lines = [f"Predicted intent: {predicted_label}", "", "Label Confidence Scores:"]
    report_lines.extend(
        f"{label}: {probabilities[0][i].item():.4f}"
        for i, label in label_mapping_reverse.items()
    )
    report = "\n".join(report_lines)

    # Keep the console log and hand the same text back to the caller.
    print(report)
    return report
# Text-in / text-out Gradio front end around the classifier, launched with sharing enabled.
iface = gr.Interface(
    fn=classify_user_input,
    inputs="text",
    outputs="text",
)
iface.launch(share=True)