import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
#import numpy as np
import sys
import torch.nn.functional as F
#from torch.nn import CrossEntropyLoss
#from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import json
import gradio as gr
from huggingface_hub import HfApi, login, upload_folder, create_repo
import os

# Load configuration file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

num_args = len(config)
arg1 = config.get('arg1', 'default_value1')
arg2 = config.get('arg2', 'default_value2')

print(f"Argument 1: {arg1}")
print(f"Argument 2: {arg2}")
print(f"Total argument size: {num_args}")

if num_args > 1:
    # arg1 selects the run mode, arg2 carries the data/model base name
    runModel = arg1
    print(f"Passed value: {runModel}")
    print(arg2)
else:
    print("No argument was passed.")
    runModel = '0'  # default to loading the pre-trained model when no mode is configured

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

modelNameToUse = arg2

if runModel == '1':
    dataFileName = arg2 + '.csv'
    print(dataFileName)

    # Load the data from the CSV file
    df = pd.read_csv(dataFileName)

    # Access the text and labels
    texts = df['text'].tolist()
    labels = df['label'].tolist()

    print('Train Model')

    # Encode the labels
    sorted_labels = sorted(df['label'].unique())
    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
    df['label'] = df['label'].map(label_mapping)
    print(df['label'])

    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Tokenization
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Model and training setup
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', output_attentions=True, num_labels=len(label_mapping)
    ).to(device)
    model.resize_token_embeddings(len(tokenizer))

    train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=64)
    test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=64)

    # Dataset class
    class IntentDataset(Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = IntentDataset(train_encodings, list(train_df['label']))
    test_dataset = IntentDataset(test_encodings, list(test_df['label']))

    # Log in to the Hugging Face Hub using the token from the environment
    token = os.getenv("hf_token")
    login(token=token)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results_' + modelNameToUse,
        num_train_epochs=8,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=500,
        weight_decay=0.02,
        logging_dir='./logs_' + modelNameToUse,
        logging_steps=10,
        evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    trainer.evaluate()
    # Inverse mapping (id -> label name) used when reporting errors;
    # this must match the label encoding produced during training
    label_mapping = {
        0: "lastmonth",
        1: "nextweek",
        2: "sevendays",
        3: "today",
        4: "tomorrow",
        5: "yesterday"
    }

    def evaluate_and_report_errors(model, dataloader, tokenizer):
        model.eval()
        incorrect_predictions = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)

                for i, prediction in enumerate(predictions):
                    if prediction != labels[i]:
                        incorrect_predictions.append({
                            "prompt": tokenizer.decode(input_ids[i], skip_special_tokens=True),
                            "predicted": prediction.item(),
                            "actual": labels[i].item()
                        })

        # Print incorrect predictions
        if incorrect_predictions:
            print("\nIncorrect Predictions:")
            for error in incorrect_predictions:
                print(f"Sentence: {error['prompt']}")
                print(f"Predicted Label: {label_mapping[error['predicted']]} | Actual Label: {label_mapping[error['actual']]}\n")
        else:
            print("\nNo incorrect predictions found.")

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=True
        #num_workers=4  # Increase workers for faster data loading
    )

    evaluate_and_report_errors(model, train_dataloader, tokenizer)

    model_path = './' + modelNameToUse + '_model'
    tokenizer_path = './' + modelNameToUse + '_tokenizer'

    if os.path.isdir(model_path) and os.path.isdir(tokenizer_path):
        print(f"The directory of model {model_path} exists.")
        print("Directory contents:", os.listdir(model_path))
        print(f"The directory of tokenizer {tokenizer_path} exists.")
        print("Directory contents:", os.listdir(tokenizer_path))
    else:
        print(f"The directory {model_path} does not exist. Creating it now...")
        print(f"The directory {tokenizer_path} does not exist. Creating it now...")
        os.makedirs(model_path, exist_ok=True)
        os.makedirs(tokenizer_path, exist_ok=True)
        print(f"Directory {model_path} created successfully.")
        print(f"Directory {tokenizer_path} created successfully.")

    # Save the model and tokenizer
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)

    # Check files in the model directory
    model_files = os.listdir(model_path)
    print("Specific files in model directory:", model_files)

    # Check files in the tokenizer directory
    tokenizer_files = os.listdir(tokenizer_path)
    print("Specific files in tokenizer directory:", tokenizer_files)

    # Push to the Hugging Face Hub repository
    repo_name = "Reyad-Ahmmed/hf-data-timeframe"  # Repository name
    api_token = os.getenv("hf_token")  # Retrieve the API token from the environment variable

    if not api_token:
        raise ValueError("API token not found. Please set the hf_token environment variable.")
    # Create repository (if not already created)
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

    # Upload the model and tokenizer to the Hugging Face repository
    upload_folder(
        folder_path=model_path,
        path_in_repo="data-timeframe_2_model",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned model for test",
        #overwrite=True  # Force overwrite existing files
    )

    upload_folder(
        folder_path=tokenizer_path,
        path_in_repo="data-timeframe_2_tokenizer",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned tokenizer",
        #overwrite=True  # Force overwrite existing files
    )

    tokenizer_files = os.listdir(tokenizer_path)
    print("Specific files in tokenizer directory after commit:", tokenizer_files)

else:
    print('Load Pre-trained')
    #model_save_path = "./" + modelNameToUse + "_2_model"
    #tokenizer_save_path = "./" + modelNameToUse + "_2_tokenizer"
    model_name = "Reyad-Ahmmed/hf-data-timeframe"

    model = AutoModelForSequenceClassification.from_pretrained(model_name, subfolder="data-timeframe_2_model").to('cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="data-timeframe_2_tokenizer")
    #model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
    #tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

    # Define the label mappings (this must match the mapping used during training)
    label_mapping = {
        0: "lastmonth",
        1: "nextweek",
        2: "sevendays",
        3: "today",
        4: "tomorrow",
        5: "yesterday"
    }

    # Function to classify user input
    def classifyTimeFrame(user_input):
        # Tokenize and predict
        input_encoding = tokenizer(user_input, padding=True, truncation=True, return_tensors="pt").to('cpu')

        with torch.no_grad():
            attention_mask = input_encoding['attention_mask'].clone()

            # Modify the attention mask to emphasize certain key tokens
            # for idx, token_id in enumerate(input_encoding['input_ids'][0]):
            #     word = tokenizer.decode([token_id])
            #     print(word)
            #     if word.strip() in ["now", "same", "continue", "again", "also"]:  # Target key tokens
            #         attention_mask[0, idx] = 3  # Increase attention weight for these words
            #     else:
            #         attention_mask[0, idx] = 0
            # print(attention_mask)
            # input_encoding['attention_mask'] = attention_mask
            # print(input_encoding)

            output = model(**input_encoding, output_hidden_states=True)

        probabilities = F.softmax(output.logits, dim=-1)
        prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

        # Map prediction back to label
        print(prediction)
        predicted_label = label_mapping[prediction[0]]
        result = f"Predicted intent: {predicted_label}\n\n"
        print(f"Predicted intent: {predicted_label}\n")

        # Print the confidence for each label
        print("\nLabel Confidence Scores:")
        for i, label in label_mapping.items():
            confidence = probabilities[0][i].item()  # Confidence score for each label
            print(f"{label}: {confidence:.4f}")
            result += f"{label}: {confidence:.4f}\n"
        print("\n")

        return result

    iface = gr.Interface(fn=classifyTimeFrame, inputs="text", outputs="text")
    iface.launch(share=True)

    # Run the function
    # classifyTimeFrame()
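
# Example config.json (assumed for illustration; the real values are not shown above).
# Based on how arg1 and arg2 are used: arg1 == "1" runs the training/upload branch,
# any other value loads the pre-trained model from the Hub; arg2 is the base name for
# the input CSV (arg2 + ".csv") and for the results/model/tokenizer folder names.
#
#   {
#       "arg1": "1",
#       "arg2": "data-timeframe"
#   }
#
# The Hugging Face token is read from the hf_token environment variable.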