# Source: Hugging Face Spaces -- Reyad-Ahmmed / HF_Python (Space status: Sleeping)
# File size: 11,526 bytes

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd

#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score, confusion_matrix
#import matplotlib.pyplot as plt
import seaborn as sns
#import numpy as np
import sys
import torch.nn.functional as F
#from torch.nn import CrossEntropyLoss
#from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import json
import gradio as gr
from huggingface_hub import HfApi, login, upload_folder, create_repo
import os

# Load the run configuration.  Two keys are read from config.json:
#   arg1 -- run mode; the string '1' selects the training branch below,
#           any other value loads the already published model
#   arg2 -- base name used for the training CSV and the output directories
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

num_args = len(config)

arg1 = config.get('arg1', 'default_value1')
arg2 = config.get('arg2', 'default_value2')

print(f"Argument 1: {arg1}")
print(f"Argument 2: {arg2}")
print(f"Total argument size: {num_args}")

# Bind runModel unconditionally.  It used to be assigned only inside the
# `if` below, so a config with fewer than two entries crashed with a
# NameError at the later `runModel == '1'` test instead of falling through
# to the "load pre-trained" branch.
runModel = None
if num_args > 1:
    runModel = arg1
    print(f"Passed value: {runModel}")
    print(arg2)
else:
    print("No argument was passed.")

# Computed once here; the model itself is placed explicitly further down.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
modelNameToUse = arg2

if runModel == '1':
    # The training data is expected in "<arg2>.csv" with 'text' and 'label' columns.
    dataFileName = arg2 + '.csv'
    print(dataFileName)
    df = pd.read_csv(dataFileName)

    # Raw columns as plain Python lists.
    texts = df['text'].tolist()
    labels = df['label'].tolist()

    print('Train Model')

    # Encode labels as consecutive integers, ordered by sorted label value.
    sorted_labels = sorted(df['label'].unique())
    label_mapping = dict(zip(sorted_labels, range(len(sorted_labels))))
    df['label'] = df['label'].map(label_mapping)
    print(df['label'])

    # Fixed seed keeps the 80/20 train/test split reproducible.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Tokenizer and classification head both start from roberta-base.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        output_attentions=True,
        num_labels=len(label_mapping),
    )
    model = model.to('cpu')

    model.resize_token_embeddings(len(tokenizer))

    # Tokenize both splits with identical settings (truncate at 64 tokens, pad per call).
    train_encodings = tokenizer(
        list(train_df['text']),
        truncation=True,
        padding=True,
        max_length=64,
    )
    test_encodings = tokenizer(
        list(test_df['text']),
        truncation=True,
        padding=True,
        max_length=64,
    )
    # Dataset class
    class IntentDataset(Dataset):
        """Map-style dataset pairing tokenizer encodings with integer labels.

        Args:
            encodings: mapping of feature name to a sequence indexable by
                example position (iterated with ``.items()`` and indexed
                with ``[idx]``).
            labels: sequence of per-example labels, indexable by the same
                positions; its length defines the dataset length.
        """

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            """Return one example as a dict of tensors, with the label under 'labels'."""
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            """Return the number of examples (one per label)."""
            return len(self.labels)

    train_dataset = IntentDataset(train_encodings, list(train_df['label']))
    test_dataset = IntentDataset(test_encodings, list(test_df['label']))

    # Authenticate with the Hugging Face Hub before training.  Fail fast when
    # the token is missing: passing None to login() falls back to an
    # interactive prompt, and the upload after training needs the token
    # anyway, so there is no point in training first and failing afterwards.
    token = os.getenv("hf_token")
    if not token:
        raise ValueError("API token not found. Please set the hf_token environment variable.")
    login(token=token)

    # Training hyper-parameters; outputs and logs are kept per model name.
    training_args = TrainingArguments(
        output_dir='./results_' + modelNameToUse,
        num_train_epochs=8,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=500,
        weight_decay=0.02,
        logging_dir='./logs_' + modelNameToUse,
        logging_steps=10,
        # Evaluate on the held-out split at the end of every epoch.
        # NOTE(review): newer transformers releases name this keyword
        # `eval_strategy` -- confirm against the installed version.
        evaluation_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the held-out split
    trainer.evaluate()

    # Invert the label -> id encoding built from the CSV (ids follow the
    # position in sorted_labels) so the error report uses the label names
    # actually present in the training data.  The previous hard-coded table
    # only matched one specific six-label dataset and raised KeyError, or
    # printed the wrong name, for any other label set.
    label_mapping = dict(enumerate(sorted_labels))

    def evaluate_and_report_errors(model, dataloader, tokenizer, id_to_label=None):
        """Run the model over a dataloader and print every misclassified example.

        Args:
            model: callable module accepting ``input_ids`` / ``attention_mask``
                keyword arguments and returning an object with ``.logits``.
            dataloader: iterable of batches, each a dict holding 'input_ids',
                'attention_mask' and 'labels' tensors.
            tokenizer: object whose ``decode`` turns token ids back into text.
            id_to_label: optional mapping of class id to display name; when
                omitted, the module-level ``label_mapping`` is used.

        Returns:
            A list of dicts with keys 'prompt', 'predicted' and 'actual', one
            per misclassified example, in the order they were encountered.
        """
        if id_to_label is None:
            id_to_label = label_mapping

        # Send inputs to wherever the model's weights are, instead of a
        # separately computed global device that may not match.
        first_param = next(model.parameters(), None)
        model_device = first_param.device if first_param is not None else torch.device('cpu')

        model.eval()
        incorrect_predictions = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(model_device)
                attention_mask = batch['attention_mask'].to(model_device)
                labels = batch['labels'].to(model_device)
                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
                predictions = torch.argmax(logits, dim=1)

                # Convert once per batch rather than calling .item() per element.
                for ids, predicted, actual in zip(input_ids, predictions.tolist(), labels.tolist()):
                    if predicted != actual:
                        incorrect_predictions.append({
                            "prompt": tokenizer.decode(ids, skip_special_tokens=True),
                            "predicted": predicted,
                            "actual": actual,
                        })

        # Print incorrect predictions
        if incorrect_predictions:
            print("\nIncorrect Predictions:")
            for error in incorrect_predictions:
                # Fall back to the raw id when no display name is known.
                predicted_name = id_to_label.get(error['predicted'], error['predicted'])
                actual_name = id_to_label.get(error['actual'], error['actual'])
                print(f"Sentence: {error['prompt']}")
                print(f"Predicted Label: {predicted_name} | Actual Label: {actual_name}\n")
        else:
            print("\nNo incorrect predictions found.")

        return incorrect_predictions

    # Re-run the trained model over the training split to list the examples
    # it still gets wrong.  Shuffling serves no purpose for evaluation and
    # made the report order differ from run to run, so it is disabled.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=False,
    )

    evaluate_and_report_errors(model, train_dataloader, tokenizer)

    model_path = './' + modelNameToUse + '_model'
    tokenizer_path = './' + modelNameToUse + '_tokenizer'

    # Report and create each output directory on its own.  The earlier
    # combined `and` check announced both directories as missing whenever
    # just one of them was absent.
    for artifact_kind, artifact_dir in (("model", model_path), ("tokenizer", tokenizer_path)):
        if os.path.isdir(artifact_dir):
            print(f"The directory of {artifact_kind} {artifact_dir} exists.")
            print("Directory contents:", os.listdir(artifact_dir))
        else:
            print(f"The directory {artifact_dir} does not exist. Creating it now...")
            os.makedirs(artifact_dir, exist_ok=True)
            print(f"Directory {artifact_dir} created successfully.")

    # Save the model and tokenizer
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)

    # List what was written to the model directory
    model_files = os.listdir(model_path)
    print("Specific files in model directory:", model_files)

    # List what was written to the tokenizer directory
    tokenizer_files = os.listdir(tokenizer_path)
    print("Specific files in tokenizer directory:", tokenizer_files)

    # Target repository on the Hugging Face Hub
    repo_name = "Reyad-Ahmmed/hf-data-timeframe"

    # The API token is read from the `hf_token` environment variable
    api_token = os.getenv("hf_token")

    if not api_token:
        # The message now names the variable that is actually read above.
        raise ValueError("API token not found. Please set the hf_token environment variable.")

    # Create repository (if not already created)
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

    # Upload the model and tokenizer into separate subfolders of the repo;
    # the "load pre-trained" branch reads them back from these subfolders.
    upload_folder(
        folder_path=model_path,
        path_in_repo="data-timeframe_model",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned model for test",
    )

    upload_folder(
        folder_path=tokenizer_path,
        path_in_repo="data-timeframe_tokenizer",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned tokenizer",
    )

    tokenizer_files = os.listdir(tokenizer_path)
    print("Specific files in tokenizer directory After Commit:", tokenizer_files)

else:
    print('Load Pre-trained')

    #model_save_path = "./" + modelNameToUse + "_model"
    #tokenizer_save_path = "./" + modelNameToUse + "_tokenizer"
    
    model_name = "Reyad-Ahmmed/hf-data-timeframe"

    # RobertaTokenizer.from_pretrained(model_save_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, subfolder="data-timeframe_model").to('cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="data-timeframe_tokenizer")

    #model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
    #tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

# Class id -> timeframe name, as looked up by classifyTimeFrame.
# The order must match the label encoding used during training.
label_mapping = dict(enumerate((
    "lastmonth",
    "nextweek",
    "sevendays",
    "today",
    "tomorrow",
    "yesterday",
)))


#Function to classify user input
def classifyTimeFrame(user_input):
    """Classify a piece of text into one of the timeframe labels.

    Uses the module-level ``tokenizer``, ``model`` and ``label_mapping``.

    Args:
        user_input: the text to classify.

    Returns:
        A multi-line string: the predicted label first, followed by one
        "<label>: <confidence>" line per entry of ``label_mapping``.  For
        ``None`` or blank input a short prompt asking for text is returned
        instead of running the model.
    """
    # Nothing meaningful to classify; also avoids passing None to the tokenizer.
    if user_input is None or not user_input.strip():
        return "Please enter some text to classify."

    # Put the inputs on the same device as the model weights.  The model may
    # no longer be on the CPU once it has been through training, so a fixed
    # 'cpu' target can cause a device mismatch.
    model_device = next(model.parameters()).device
    input_encoding = tokenizer(
        user_input, padding=True, truncation=True, return_tensors="pt"
    ).to(model_device)

    with torch.no_grad():
        output = model(**input_encoding)

    probabilities = F.softmax(output.logits, dim=-1)
    # Plain int so the dictionary lookup does not depend on array scalar types.
    predicted_index = int(torch.argmax(output.logits, dim=1)[0])
    print(predicted_index)
    predicted_label = label_mapping[predicted_index]

    result = f"Predicted intent: {predicted_label}\n\n"
    print(f"Predicted intent: {predicted_label}\n")

    # Report the confidence for each label, both on stdout and in the result.
    print("\nLabel Confidence Scores:")
    for i, label in label_mapping.items():
        confidence = probabilities[0][i].item()
        print(f"{label}: {confidence:.4f}")
        result += f"{label}: {confidence:.4f}\n"
    print("\n")
    return result

# Serve the classifier through a text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=classifyTimeFrame,
    inputs="text",
    outputs="text",
)
iface.launch(share=True)

#Run the function
#classifyTimeFrame()