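# Hosting-page status text captured with this file (not part of the program): "Spaces: Runtime error"
# Original command-line form (the settings are now read from config.json):
# python hf-fine-tune-fleet-8.py 1 train_fleet test_fleet 1 1 saved_fleet_model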
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments | |
import torch | |
from torch.utils.data import Dataset | |
from torch.utils.data import DataLoader | |
from transformers import RobertaTokenizer, RobertaForSequenceClassification | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.metrics import accuracy_score, confusion_matrix | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import numpy as np | |
import sys | |
import torch.nn.functional as F | |
from torch.nn import CrossEntropyLoss | |
from sklearn.decomposition import PCA | |
import matplotlib.pyplot as plt | |
import re | |
from datasets import load_dataset, DatasetDict | |
import time | |
import pprint | |
import json | |
from huggingface_hub import HfApi, login, upload_folder, create_repo | |
import os | |
from flask import Flask, jsonify, request | |
import requests | |
from fetch_data import fetch_and_update_training_data | |
import gradio as gr | |
# Load the run settings from config.json. The keys arg1..arg7 mirror the
# positional command-line arguments this script used to take.
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

num_args = len(config)
arg1 = config.get('arg1', '1')
arg2 = config.get('arg2', 'train_fleet')
arg3 = config.get('arg3', 'test_fleet')
arg4 = config.get('arg4', '1')
arg5 = config.get('arg5', '1')
arg6 = config.get('arg6', 'saved_fleet_model')
arg7 = config.get('arg7', 'Model')

if num_args != 7:
    # Passing a message to sys.exit() writes it to stderr and exits with a
    # non-zero status, so a wrapper script can detect the bad configuration
    # (a bare sys.exit() would report success).
    sys.exit(f"config.json must contain exactly 7 entries (arg1..arg7); found {num_args}")

# str() so that both "1" and the JSON number 1 select the training path; the
# value is later compared against the string '1'.
should_train_model = str(arg1)                # should train model?
train_file = arg2                             # training file name (without ".csv")
test_file = arg3                              # eval file name (without ".csv")
batch_size_for_trainer = int(arg4)            # batch sizes to send to trainer
should_produce_eval_matrix = int(arg5)        # should produce matrix?
path_to_save_trained_model_to = arg6

import os

# Only expose the first GPU (in case very weak ones are also installed, which
# would slow the training down).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if should_train_model == '1':  # train model
    # ---- settings ----
    model_save_path = path_to_save_trained_model_to
    bias_non_fleet = 1.0  # weight intended for the optional class-weighted loss (see CustomTrainer)
    epochs_to_run = 15
    file_path_train = train_file + ".csv"
    file_path_test = test_file + ".csv"

    # Read the training and evaluation data into pandas DataFrames; they are
    # converted to datasets further down.
    file_train_df = fetch_and_update_training_data(file_path_train)
    file_test_df = pd.read_csv(file_path_test)

    # Combine both frames so the label map covers every label (intent) that
    # appears in either the training or the evaluation data.
    df = pd.concat([file_train_df, file_test_df], ignore_index=True)
    sorted_labels = sorted(df['label'].unique())

    # Label name -> integer id, in sorted label order.
    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
    print("label mappings")
    print(label_mapping)

    repo_name = "Reyad-Ahmmed/hf-data-timeframe"

    # Tokenizer and model are loaded from the same repo subfolder so that
    # their vocabularies match.
    tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")

    # Placeholder tags used in the training and eval data. Only left tags
    # appear in the data, so no right tags are added.
    new_tokens = [
        '<EMPLOYEE_FIRST_NAME>',
        '<EMPLOYEE_LAST_NAME>',
        '<POINT_ADDRESS>',
        '<TRUCK_NAME>',
        '<POINT_CLASS_NAME>',
        '<POINT_NAME>',
        '<TRUCK_CLASS_NAME>',
        '<TRUCK_STATUS_NAME>',  # was '<TRUCK_STATUS_NAME>]' (stray bracket inside the token)
    ]
    tokenizer.add_tokens(new_tokens)

    # Move the model to the detected device instead of a hard-coded 'cuda',
    # so the script also runs on CPU-only hosts.
    model = BertForSequenceClassification.from_pretrained(
        repo_name,
        subfolder="bert_embeddings_finetune",
        output_attentions=True,
        num_labels=len(label_mapping),
        output_hidden_states=True,
    ).to(device)

    # Resize the embedding matrix to include the tags added to the tokenizer.
    model.resize_token_embeddings(len(tokenizer))

    # NOTE: this import rebinds `Dataset` to the Hugging Face class for the
    # rest of the script (the file-level import is torch's Dataset).
    from datasets import Dataset, DatasetDict
    from sklearn.model_selection import train_test_split

    # Step 2: convert string labels to integers (same mapping as label_mapping).
    label_to_id = {label: idx for idx, label in enumerate(sorted(df["label"].unique()))}
    print(label_to_id)

    # Dataframes contain prompts and label names.
    print('before converting labels to labelIds')
    pprint.pp(file_train_df)
    pprint.pp(file_test_df)

    # Swap the label names for label ids in both dataframes.
    file_train_df["label"] = file_train_df["label"].map(label_to_id)
    file_test_df["label"] = file_test_df["label"].map(label_to_id)

    print('after swapping out label names with Ids')
    pprint.pp(file_train_df)
    pprint.pp(file_test_df)

    # Step 3: convert both dataframes to dictionaries.
    emotions_dict_train = {"text": file_train_df["text"].tolist(), "label": file_train_df["label"].tolist()}
    emotions_dict_test = {"text": file_test_df["text"].tolist(), "label": file_test_df["label"].tolist()}

    print('dictionaries')
    pprint.pp(emotions_dict_train)
    pprint.pp(emotions_dict_test)

    # Convert the dictionaries to datasets.
    emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
    emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

    # Step 4: top-level dictionary with a "train" split and a "validation"
    # split (the latter holds the test data).
    emotions_encoded = DatasetDict({
        'train': emotions_dataset_train,
        'validation': emotions_dataset_test,
    })

    def tokenize(batch):
        """Tokenize the "text" column of a batch, padding and truncating."""
        return tokenizer(batch["text"], padding=True, truncation=True)

    # Map both splits through the tokenizer and expose the tokenizer outputs
    # and labels as torch tensors.
    emotions_encoded = emotions_encoded.map(tokenize, batched=True)
    emotions_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Set the model to evaluation mode (this line does not run any training or eval).
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    from sklearn.metrics import accuracy_score, f1_score

    def compute_metrics(pred):
        """Return {"accuracy": ...} for a prediction output.

        `pred.predictions` may be a tuple (the model is configured to return
        hidden states and attentions); in that case its first element is
        taken as the logits.

        NOTE(review): this function is only referenced by the CustomTrainer
        setup and is not passed to the plain Trainer created below.
        """
        logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
        preds = logits.argmax(-1)
        labels = pred.label_ids
        accuracy = (preds == labels).astype(float).mean()
        return {"accuracy": accuracy}

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs_to_run,
        per_device_train_batch_size=batch_size_for_trainer,
        per_device_eval_batch_size=batch_size_for_trainer,
        warmup_steps=500,
        learning_rate=2e-5,
        weight_decay=0.02,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
    )

    class CustomTrainer(Trainer):
        """Trainer whose loss function can be swapped for a class-weighted one.

        To bias the model towards or away from a label, build a weight tensor
        with one entry per label (bias_non_fleet is meant for this), move it
        to the training device and pass it as
        `torch.nn.CrossEntropyLoss(weight=class_weights)`. Weights above one
        make the model more sensitive to that label, below one less.
        Currently the loss is unweighted and this class is not instantiated.
        """

        def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
            # num_items_in_batch is accepted (and ignored) so the override
            # matches the signature newer Trainer versions call it with.
            labels = inputs.get("labels")
            outputs = model(**inputs)
            logits = outputs.get("logits")
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return (loss, outputs) if return_outputs else loss

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=emotions_encoded["train"],
        eval_dataset=emotions_encoded["validation"],
        tokenizer=tokenizer,
    )

    # Train the model and measure the training time.
    start_time = time.time()
    trainer.train()
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution Time: {execution_time:.2f} seconds")

    # Send the validation prompts through the model; used by the
    # error-analysis matrix below.
    preds_output = trainer.predict(emotions_encoded["validation"])

    # ---- error-analysis matrix ----
    # Extract the logits (first element when the output is a tuple).
    logits = preds_output.predictions[0] if isinstance(preds_output.predictions, tuple) else preds_output.predictions
    y_preds = np.argmax(logits, axis=1)                            # predictions
    y_valid = np.array(emotions_encoded["validation"]["label"])   # labels

    from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
    import matplotlib.pyplot as plt
    import numpy as np

    print("Ypreds and valids shape")
    print(y_preds.shape, y_valid.shape)

    def plot_confusion_matrix_with_text_labels(y_preds, y_true, labels):
        """Plot a row-normalized confusion matrix with text tick labels.

        The figure is written to confusion_matrix.png and then shown.
        """
        cm = confusion_matrix(y_true, y_preds, normalize="true")
        fig, ax = plt.subplots(figsize=(len(labels), len(labels)))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
        # Rotate the x-axis labels to prevent overlap.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        plt.title("Normalized Confusion Matrix with Text Labels")
        plt.tight_layout()
        plt.savefig("confusion_matrix.png")
        plt.show()

    # Only labels that occur in the validation labels or the predictions are
    # shown in the matrix.
    unique_labels = sorted(set(y_valid) | set(y_preds))
    id_to_label = {v: k for k, v in label_to_id.items()}
    labels = [id_to_label[label] for label in unique_labels]
    print("unique_labels")
    print(labels)

    if should_produce_eval_matrix == 1:
        plot_confusion_matrix_with_text_labels(y_preds, y_valid, labels)

    # Store the label mapping in the model config so that any program loading
    # the model (for instance the inference-only branch below) can read it back.
    model.config.label_mapping = label_mapping

    # Save the model and tokenizer locally.
    model.save_pretrained(f"./{model_save_path}")
    tokenizer.save_pretrained(f"./{model_save_path}")

    # ---- push to the Hugging Face repository ----
    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
    api_token = os.getenv("hf_token")  # API token is read from the `hf_token` environment variable
    if not api_token:
        # The message names the variable that is actually read above.
        raise ValueError("API token not found. Please set the hf_token environment variable.")

    # Create the repository if it does not exist yet.
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

    # Upload the saved model and tokenizer folder.
    upload_folder(
        folder_path=f"{model_save_path}",
        path_in_repo=f"{model_save_path}",
        repo_id=repo_name,
        token=api_token,
        commit_message="Push fleet model",
    )
else:
    print('Load Pre-trained')
    # model_save_path is only assigned in the training branch, so the path is
    # derived from the configured save location here.
    model_save_path = f"./{path_to_save_trained_model_to}"
    tokenizer_save_path = model_save_path

    # Use the detected device so the model and the inference inputs agree.
    model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

    # The label mapping was stored in the model config at training time.
    label_mapping = model.config.label_mapping
    label_mapping_reverse = {value: key for key, value in label_mapping.items()}
def classify_user_input(user_input):
    """Classify one piece of user text and return a text report.

    Uses the module-level `tokenizer`, `model` and `label_mapping`
    (label name -> id). The input is tokenized, run through the model once
    without gradient tracking, and the logits are turned into per-label
    probabilities with softmax.

    Args:
        user_input: The text to classify.

    Returns:
        A string with the predicted intent followed by the confidence score
        of every label. The same text is printed to stdout.
    """
    if not user_input or not user_input.strip():
        return "Please enter some text to classify."

    # Place the inputs on whatever device the model lives on rather than
    # assuming 'cuda'.
    input_encoding = tokenizer(
        user_input, padding=True, truncation=True, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        output = model(**input_encoding, output_hidden_states=True)

    # Probabilities (confidence scores) for the single input row.
    probabilities = F.softmax(output.logits, dim=-1)[0]
    predicted_id = int(torch.argmax(probabilities).item())

    # Build id -> label here from label_mapping, which both the training and
    # the load-pretrained paths define.
    id_to_label = {int(idx): label for label, idx in label_mapping.items()}
    predicted_label = id_to_label[predicted_id]

    report_lines = [f"Predicted intent: {predicted_label}", "", "Label Confidence Scores:"]
    report_lines.extend(
        f"{label}: {probabilities[idx].item():.4f}"
        for idx, label in sorted(id_to_label.items())
    )
    report = "\n".join(report_lines)

    print(report)
    # Return once per call; the Gradio text output component displays it.
    return report
# Expose the classifier through a text-in / text-out Gradio interface and
# start serving it.
iface = gr.Interface(
    fn=classify_user_input,
    inputs="text",
    outputs="text",
)
iface.launch(share=True)