Spaces:

Reyad-Ahmmed
/

HF_Python

Runtime error

App Files Files Community

HF_Python / app.py

Reyad-Ahmmed

Update app.py

a117686 verified 4 days ago

raw

history blame contribute delete

16.3 kB

	#python hf-fine-tune-fleet-8.py 1 train_fleet test_fleet 1 1 saved_fleet_model

	import pandas as pd
	from sklearn.model_selection import train_test_split
	from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
	import torch
	from torch.utils.data import Dataset
	from torch.utils.data import DataLoader
	from transformers import RobertaTokenizer, RobertaForSequenceClassification
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score, confusion_matrix
	import matplotlib.pyplot as plt
	import seaborn as sns
	import numpy as np
	import sys
	import torch.nn.functional as F
	from torch.nn import CrossEntropyLoss
	from sklearn.decomposition import PCA
	import matplotlib.pyplot as plt
	import re
	from datasets import load_dataset, DatasetDict
	import time
	import pprint
	import json
	from huggingface_hub import HfApi, login, upload_folder, create_repo
	import os
	from flask import Flask, jsonify, request
	import requests
	from fetch_data import fetch_and_update_training_data
	import gradio as gr

	# Load the run settings from config.json (they replace the former command-line arguments).
	with open('config.json', 'r', encoding='utf-8') as config_file:
	    config = json.load(config_file)

	num_args = len(config)

	# Exactly seven entries (arg1..arg7) are required. Exit with a non-zero status
	# so that a broken configuration is reported as a failure, not as a clean run.
	if num_args != 7:
	    print(f"config.json must contain exactly 7 entries (arg1..arg7) but has {num_args}")
	    sys.exit(1)

	arg1 = config.get('arg1', '1')
	arg2 = config.get('arg2', 'train_fleet')
	arg3 = config.get('arg3', 'test_fleet')
	arg4 = config.get('arg4', '1')
	arg5 = config.get('arg5', '1')
	arg6 = config.get('arg6', 'saved_fleet_model')
	arg7 = config.get('arg7', 'Model')

	# str() so that a JSON number 1 selects training just like the string "1" does;
	# the flag is compared against '1' further down.
	should_train_model = str(arg1)  # should train model?
	train_file = arg2  # training file name (without the .csv extension)
	test_file = arg3  # eval file name (without the .csv extension)
	batch_size_for_trainer = int(arg4)  # batch size handed to the trainer
	should_produce_eval_matrix = int(arg5)  # 1 -> plot the confusion matrix
	path_to_save_trained_model_to = arg6

	import os

	# Expose only GPU 0 to CUDA (in case very weak GPUs are also installed, which would slow training down).
	os.environ["CUDA_VISIBLE_DEVICES"] = "0"
	# Use the GPU when one is available, otherwise fall back to the CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


	if (should_train_model=='1'): # training path; otherwise the `else:` further down loads a saved model
	# NOTE(review): block indentation was lost in this copy of the file - presumably
	# everything down to the matching `else:` belongs to this branch; confirm.

	# Settings
	model_save_path = path_to_save_trained_model_to
	bias_non_fleet = 1.0 # only referenced by the commented-out class-weights line further down
	epochs_to_run = 15

	file_path_train = train_file + ".csv"
	file_path_test = test_file + ".csv"

	# Read the data into pandas DataFrames; they are later converted to datasets and used to train and evaluate the model.
	# The training frame comes from the project helper fetch_and_update_training_data
	# (presumably a DataFrame with "text" and "label" columns - verify against fetch_data).

	file_train_df = fetch_and_update_training_data(file_path_train)
	file_test_df = pd.read_csv(file_path_test)


	# Combine both dataframes so that every label (intent) used for training or evaluation is known
	df = pd.concat([file_train_df, file_test_df], ignore_index=True)
	sorted_labels = sorted(df['label'].unique())


	# Map each unique label to its position in the sorted label list
	label_mapping = {label: i for i, label in enumerate(sorted_labels)}
	print("label mappings")
	print(label_mapping)

	repo_name = "Reyad-Ahmmed/hf-data-timeframe"

	# Tokenizer: loaded from the same hub subfolder as the BERT checkpoint below so the two match.
	# (Local alternative that was used before: './mitra_ai_fleet_bert_tokenizer'.)
	tokenizer = BertTokenizer.from_pretrained(repo_name, subfolder="bert_embeddings_finetune")

	# Placeholder tags that occur in the training and eval data. Only the left tags are
	# needed because the data never contains right (closing) tags.
	# The last entry used to be '<TRUCK_STATUS_NAME>]' - the stray ']' meant the real
	# '<TRUCK_STATUS_NAME>' tag was never registered as a single token.
	new_tokens = [
	    '<EMPLOYEE_FIRST_NAME>',
	    '<EMPLOYEE_LAST_NAME>',
	    '<POINT_ADDRESS>',
	    '<TRUCK_NAME>',
	    '<POINT_CLASS_NAME>',
	    '<POINT_NAME>',
	    '<TRUCK_CLASS_NAME>',
	    '<TRUCK_STATUS_NAME>',
	]
	tokenizer.add_tokens(new_tokens)

	# Model: one output per known label. It is moved to `device` (chosen above) instead of a
	# hard-coded 'cuda', so the script also runs on CPU-only hosts.
	# (Local alternative that was used before: './mitra_ai_fleet_bert'.)
	model = BertForSequenceClassification.from_pretrained(
	    repo_name,
	    subfolder="bert_embeddings_finetune",
	    output_attentions=True,
	    num_labels=len(label_mapping),
	    output_hidden_states=True,
	).to(device)

	# Grow the embedding matrix to cover the tags that were just added to the tokenizer
	model.resize_token_embeddings(len(tokenizer))

	#important_tokens = ["Acura-New", "TR-9012", "TR-NEW-02"]

	# NOTE: this import rebinds `Dataset` to the Hugging Face class (the file's top-level
	# import of the same name comes from torch.utils.data).
	from datasets import Dataset, DatasetDict
	from sklearn.model_selection import train_test_split

	# Step 2: give every label name an integer id (its position in the sorted list of unique labels)
	label_to_id = dict(
	    (name, position)
	    for position, name in enumerate(sorted(df["label"].unique()))
	)
	print(label_to_id)

	# At this point both dataframes still hold prompts and label *names*
	print('before converting labels to labelIds')
	for frame in (file_train_df, file_test_df):
	    pprint.pp(frame)

	# Swap the label names for their ids, in place, in both dataframes
	for frame in (file_train_df, file_test_df):
	    frame["label"] = frame["label"].map(label_to_id)

	print('after swapping out label names with Ids')
	for frame in (file_train_df, file_test_df):
	    pprint.pp(frame)

	# Step 3: turn each dataframe into a plain dict of column lists
	emotions_dict_train = {
	    "text": file_train_df["text"].tolist(),
	    "label": file_train_df["label"].tolist(),
	}
	emotions_dict_test = {
	    "text": file_test_df["text"].tolist(),
	    "label": file_test_df["label"].tolist(),
	}

	print('dictionaries')
	pprint.pp(emotions_dict_train)
	pprint.pp(emotions_dict_test)

	# Build one dataset per dictionary
	emotions_dataset_train = Dataset.from_dict(emotions_dict_train)
	emotions_dataset_test = Dataset.from_dict(emotions_dict_test)

	# Step 4: bundle both datasets; the test data serves as the "validation" split
	emotions_encoded = DatasetDict(
	    {
	        'train': emotions_dataset_train,
	        'validation': emotions_dataset_test,
	    }
	)


	# Define the tokenize function
	def tokenize(batch):
	    """Tokenize the "text" entries of *batch* with the module-level tokenizer.

	    Padding and truncation are both enabled; the tokenizer's output is returned unchanged.
	    """
	    texts = batch["text"]
	    return tokenizer(texts, padding=True, truncation=True)


	# Run the tokenizer over both splits (train and validation); the result is exposed
	# as torch tensors limited to the columns listed below.
	emotions_encoded = emotions_encoded.map(tokenize, batched=True)
	torch_columns = ["input_ids", "attention_mask", "label"]
	emotions_encoded.set_format(type="torch", columns=torch_columns)

	# Switch the model to evaluation mode (this does not run any training or evaluation)
	model.eval()
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")
	model.to(device)

	from sklearn.metrics import accuracy_score, f1_score

	# Define additional compute_metrics (used as part of error-analysis - produces "accuracy" metric which can be used in another program
	# that shows any training prompts with large losses)
	def compute_metrics(pred):
	    """Return ``{"accuracy": ...}`` for one prediction result.

	    *pred* must expose ``predictions`` (the logits, or a tuple whose first item is
	    the logits) and ``label_ids`` (the reference class ids). Accuracy is the
	    fraction of rows whose arg-max over the last axis equals the reference id.
	    """
	    scores = pred.predictions
	    if isinstance(scores, tuple):
	        # Only the first tuple entry (the logits) is relevant here.
	        scores = scores[0]
	    predicted_ids = scores.argmax(-1)
	    hits = predicted_ids == pred.label_ids
	    return {"accuracy": hits.astype(float).mean()}


	# Hyper-parameters for the fine-tuning run; train and eval use the same batch size,
	# and the model is evaluated once per epoch.
	trainer_settings = {
	    "output_dir": './results',
	    "num_train_epochs": epochs_to_run,
	    "per_device_train_batch_size": batch_size_for_trainer,
	    "per_device_eval_batch_size": batch_size_for_trainer,
	    "warmup_steps": 500,
	    "learning_rate": 2e-5,
	    "weight_decay": 0.02,
	    "logging_dir": './logs',
	    "logging_steps": 10,
	    "evaluation_strategy": "epoch",
	}
	training_args = TrainingArguments(**trainer_settings)

	# notice the bias_non_float in next line (it is given a value at top of code)
	# class_weights = torch.tensor([1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,bias_non_fleet,1.0,1.0]) # Replace with your actual class weights
	# class_weights = class_weights.to('cuda' if torch.cuda.is_available() else 'cpu')

	# This is needed b/c loss_fn is swapped out in order to use weighted loss
	# Any class weights that are not equal to one will make the model more (if greater than one) or less (if less than one)sensitive to given label
	class CustomTrainer(Trainer):
	    """Trainer whose loss is computed explicitly with ``CrossEntropyLoss``.

	    The explicit loss function is the hook for class weighting: pass
	    ``weight=class_weights`` to ``CrossEntropyLoss`` to make the model more
	    (weight > 1) or less (weight < 1) sensitive to a given label.
	    """

	    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
	        """Return the cross-entropy loss for *inputs* (and the outputs on request).

	        Args:
	            model: the model being trained.
	            inputs: batch dict; must contain "labels".
	            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
	            num_items_in_batch: accepted (and ignored) because newer versions of
	                ``Trainer`` pass it; without it the override raises ``TypeError``.
	        """
	        labels = inputs.get("labels")
	        outputs = model(**inputs)
	        logits = outputs.get("logits")

	        # Unweighted cross-entropy; see the class docstring for the weighted variant.
	        loss_fn = torch.nn.CrossEntropyLoss()
	        loss = loss_fn(logits, labels)

	        return (loss, outputs) if return_outputs else loss


	# trainer = CustomTrainer(
	# model=model,
	# compute_metrics=compute_metrics,
	# args=training_args,
	# train_dataset=emotions_encoded["train"],
	# eval_dataset=emotions_encoded["validation"],
	# tokenizer=tokenizer )

	# compute_metrics is passed so that every evaluation also reports "accuracy";
	# previously the function was defined but never wired in, so only the loss was logged.
	# NOTE(review): with metrics enabled each evaluation gathers the full model outputs
	# (as trainer.predict below already does) - confirm this is acceptable for the
	# size of the validation set.
	trainer = Trainer(
	    model=model,
	    args=training_args,
	    train_dataset=emotions_encoded["train"],
	    eval_dataset=emotions_encoded["validation"],
	    tokenizer=tokenizer,
	    compute_metrics=compute_metrics,
	)

	# Fine-tune the model and measure how long the run takes (wall-clock)
	training_started_at = time.time()
	trainer.train()
	execution_time = time.time() - training_started_at

	print(f"Execution Time: {execution_time:.2f} seconds")

	# Run the validation prompts through the trained model; the result feeds the
	# error-analysis matrix below
	preds_output = trainer.predict(emotions_encoded["validation"])


	################# Error-analysis matrix: inputs
	# The predictions are either the logits themselves or a tuple whose first item is the logits
	raw_predictions = preds_output.predictions
	if isinstance(raw_predictions, tuple):
	    logits = raw_predictions[0]
	else:
	    logits = raw_predictions

	# Predicted class = index of the largest logit in each row
	y_preds = np.argmax(logits, axis=1) #prediction
	y_valid = np.array(emotions_encoded["validation"]["label"]) #labels

	from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
	import matplotlib.pyplot as plt
	import numpy as np
	#num_labels2 = len(label_mapping)

	print("Ypreds and valids shape")
	print(y_preds.shape, y_valid.shape)


	# Define the function to plot the confusion matrix
	def plot_confusion_matrix_with_text_labels(y_preds, y_true, labels):
	    """Plot the row-normalized confusion matrix and save it as confusion_matrix.png.

	    Args:
	        y_preds: predicted class ids.
	        y_true: reference class ids.
	        labels: display names for the matrix axes, one per class id that occurs in
	            *y_true* or *y_preds*, in ascending id order.
	    """
	    # normalize="true" divides each row by the number of true samples of that class
	    cm = confusion_matrix(y_true, y_preds, normalize="true")

	    # One inch per label, but never so small that the title and tick labels collide
	    side = max(len(labels), 4)
	    fig, ax = plt.subplots(figsize=(side, side))
	    try:
	        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
	        disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)

	        # Rotate the x-axis labels to prevent overlap
	        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

	        ax.set_title("Normalized Confusion Matrix with Text Labels")
	        fig.tight_layout()
	        fig.savefig("confusion_matrix.png")
	        plt.show()
	    finally:
	        # Release the figure; otherwise it stays registered with pyplot
	        plt.close(fig)



	# Class ids that occur in the validation labels or in the predictions - only these
	# are shown in the matrix. (The set union is written `|`; the former `\|` was an
	# escape artifact and not valid Python.)
	unique_labels = sorted(set(y_valid) | set(y_preds))
	id_to_label = {v: k for k, v in label_to_id.items()}
	labels = [id_to_label[label] for label in unique_labels]

	print ("unique_labels")
	print(labels)

	# Plot the matrix only when it was requested in the configuration
	if should_produce_eval_matrix == 1:
	    plot_confusion_matrix_with_text_labels(y_preds, y_valid, labels)

	# Store the label mapping in the model config so that any program loading the model
	# (for instance the inference-only path of this script) can retrieve it.
	model.config.label_mapping = label_mapping

	# Save the model and tokenizer locally
	model.save_pretrained(f"./{model_save_path}")
	tokenizer.save_pretrained(f"./{model_save_path}")

	# Repository the saved folder is pushed to
	repo_name = "Reyad-Ahmmed/hf-data-timeframe"

	# The token is read from the `hf_token` environment variable; `HF_API_TOKEN` (the name
	# the old error message asked for) is accepted as a fallback so both spellings work.
	api_token = os.getenv("hf_token") or os.getenv("HF_API_TOKEN")

	if not api_token:
	    raise ValueError("API token not found. Please set the hf_token (or HF_API_TOKEN) environment variable.")

	# Create repository (if not already created)
	api = HfApi()
	create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

	# Upload the saved model and tokenizer folder to the repository, under the same folder name
	upload_folder(
	    folder_path=f"{model_save_path}",
	    path_in_repo=f"{model_save_path}",
	    repo_id=repo_name,
	    token=api_token,
	    commit_message="Push fleet model",
	)

	else:
	print('Load Pre-trained')
	model_save_path = f"./{model_save_path}"
	tokenizer_save_path = f"./{model_save_path}"

	# RobertaTokenizer.from_pretrained(model_save_path)
	model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
	tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

	#Define the label mappings (this must match the mapping used during training)
	label_mapping = model.config.label_mapping
	label_mapping_reverse = {value: key for key, value in label_mapping.items()}

	#Function to classify user input
	def classify_user_input(user_input):
	    """Classify *user_input* and return a text report of the prediction.

	    The report names the predicted intent and lists the softmax confidence of every
	    label. It is both printed (server log) and returned, so the Gradio text output
	    shows it.

	    Args:
	        user_input: the prompt to classify.

	    Returns:
	        str: "Predicted intent: <label>", then one "<label>: <confidence>" line per label.
	    """
	    # Inference only: make sure dropout etc. are disabled
	    model.eval()

	    # Put the inputs on the device the model actually lives on (a hard-coded 'cuda'
	    # fails on CPU-only hosts and when the model was loaded onto the CPU)
	    input_encoding = tokenizer(
	        user_input, padding=True, truncation=True, return_tensors="pt"
	    ).to(model.device)

	    # Debug aid: log how the input was split into tokens
	    for token_id in input_encoding['input_ids'][0]:
	        print(tokenizer.decode([token_id]))

	    with torch.no_grad():
	        output = model(**input_encoding, output_hidden_states=True)

	    # Softmax turns the logits of the single input row into confidence scores
	    probabilities = F.softmax(output.logits, dim=-1)[0]
	    predicted_id = int(torch.argmax(output.logits, dim=1)[0])
	    print(predicted_id)

	    # The mapping (label name -> id) is read from the model config, which carries it in
	    # both the training path and the load-pre-trained path
	    id_to_label = {int(idx): name for name, idx in model.config.label_mapping.items()}
	    predicted_label = id_to_label[predicted_id]

	    report_lines = [
	        f"Predicted intent: {predicted_label}",
	        "",
	        "Label Confidence Scores:",
	    ]
	    for idx in sorted(id_to_label):
	        report_lines.append(f"{id_to_label[idx]}: {probabilities[idx].item():.4f}")
	    report = "\n".join(report_lines)

	    print(report)
	    return report


	# Text-in / text-out web UI around the classifier, launched with a public share link
	iface = gr.Interface(
	    fn=classify_user_input,
	    inputs="text",
	    outputs="text",
	)
	iface.launch(share=True)