# HF_Python / app_old_2.py
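# Fine-tunes a RoBERTa sequence classifier on time-frame intent data when runModel == '1';
# otherwise loads the previously pushed model from the Hugging Face Hub and serves it
# through a Gradio text interface.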
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
#import numpy as np
import sys
import torch.nn.functional as F
#from torch.nn import CrossEntropyLoss
#from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import json
import gradio as gr
from huggingface_hub import HfApi, login, upload_folder, create_repo
import os
# Load configuration file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

num_args = len(config)
arg1 = config.get('arg1', 'default_value1')
arg2 = config.get('arg2', 'default_value2')

print(f"Argument 1: {arg1}")
print(f"Argument 2: {arg2}")
print(f"Total argument size: {num_args}")

if num_args > 1:
    # arg1 selects the run mode; arg2 names the data file / model
    runModel = arg1
    print(f"Passed value: {runModel}")
    print(arg2)
else:
    print("No argument was passed.")
    runModel = '0'  # assumed default: skip training and load the pre-trained model

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
modelNameToUse = arg2
if runModel == '1':
    dataFileName = arg2 + '.csv'
    print(dataFileName)

    # Load the data from the CSV file
    df = pd.read_csv(dataFileName)

    # Access the text and labels
    texts = df['text'].tolist()
    labels = df['label'].tolist()

    print('Train Model')

    # Encode the labels (alphabetical order -> integer ids)
    sorted_labels = sorted(df['label'].unique())
    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
    df['label'] = df['label'].map(label_mapping)
    print(df['label'])

    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Tokenization
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Model and training setup
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', output_attentions=True, num_labels=len(label_mapping)).to(device)
    model.resize_token_embeddings(len(tokenizer))

    train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=64)
    test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=64)
    # Dataset class wrapping the tokenized encodings and labels for the Trainer
    class IntentDataset(Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)
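    # Wrap the train/test splits so both the Trainer and a plain DataLoader can index them.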
    train_dataset = IntentDataset(train_encodings, list(train_df['label']))
    test_dataset = IntentDataset(test_encodings, list(test_df['label']))

    # Log in to the Hugging Face Hub with the token from the environment
    token = os.getenv("hf_token")
    login(token=token)

    training_args = TrainingArguments(
        output_dir='./results_' + modelNameToUse,
        num_train_epochs=8,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=500,
        weight_decay=0.02,
        logging_dir='./logs_' + modelNameToUse,
        logging_steps=10,
        evaluation_strategy="epoch",  # evaluate at the end of each epoch
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    trainer.evaluate()
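    # Human-readable names for the label ids; mirrors the alphabetical encoding applied above
    # and is used when reporting misclassified examples.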
    label_mapping = {
        0: "lastmonth",
        1: "nextweek",
        2: "sevendays",
        3: "today",
        4: "tomorrow",
        5: "yesterday"
    }
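    # Run the trained model over a dataloader and print every example it misclassifies.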
    def evaluate_and_report_errors(model, dataloader, tokenizer):
        model.eval()
        incorrect_predictions = []

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)

                for i, prediction in enumerate(predictions):
                    if prediction != labels[i]:
                        incorrect_predictions.append({
                            "prompt": tokenizer.decode(input_ids[i], skip_special_tokens=True),
                            "predicted": prediction.item(),
                            "actual": labels[i].item()
                        })

        # Print incorrect predictions
        if incorrect_predictions:
            print("\nIncorrect Predictions:")
            for error in incorrect_predictions:
                print(f"Sentence: {error['prompt']}")
                print(f"Predicted Label: {label_mapping[error['predicted']]} | Actual Label: {label_mapping[error['actual']]}\n")
        else:
            print("\nNo incorrect predictions found.")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=10,
        shuffle=True
        #num_workers=4  # Increase workers for faster data loading
    )
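    # Note: the error report below runs over the training split, not the held-out test split.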
    evaluate_and_report_errors(model, train_dataloader, tokenizer)
    model_path = './' + modelNameToUse + '_model'
    tokenizer_path = './' + modelNameToUse + '_tokenizer'

    if os.path.isdir(model_path) and os.path.isdir(tokenizer_path):
        print(f"The directory of model {model_path} exists.")
        print("Directory contents:", os.listdir(model_path))
        print(f"The directory of tokenizer {tokenizer_path} exists.")
        print("Directory contents:", os.listdir(tokenizer_path))
    else:
        print(f"The directory {model_path} does not exist. Creating it now...")
        print(f"The directory {tokenizer_path} does not exist. Creating it now...")
        os.makedirs(model_path, exist_ok=True)  # Create the directories
        os.makedirs(tokenizer_path, exist_ok=True)
        print(f"Directory {model_path} created successfully.")
        print(f"Directory {tokenizer_path} created successfully.")
    # Save the model and tokenizer
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)

    # Check for specific files in the model directory
    model_files = os.listdir(model_path)
    print("Specific files in model directory:", model_files)

    # Check for specific files in the tokenizer directory
    tokenizer_files = os.listdir(tokenizer_path)
    print("Specific files in tokenizer directory:", tokenizer_files)
    # Push to the Hugging Face Hub repository
    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
    api_token = os.getenv("hf_token")  # Retrieve the API token from the environment variable

    if not api_token:
        raise ValueError("API token not found. Please set the hf_token environment variable.")

    # Create the repository (if not already created)
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
    # Upload the model and tokenizer to the Hugging Face repository
    upload_folder(
        folder_path=model_path,
        path_in_repo="data-timeframe_2_model",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned model for test",
    )

    upload_folder(
        folder_path=tokenizer_path,
        path_in_repo="data-timeframe_2_tokenizer",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned tokenizer",
    )
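    # List the tokenizer files once more to confirm what was pushed.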
    tokenizer_files = os.listdir(tokenizer_path)
    print("Specific files in tokenizer directory after commit:", tokenizer_files)
else:
    print('Load Pre-trained')

    model_name = "Reyad-Ahmmed/hf-data-timeframe"
    model = AutoModelForSequenceClassification.from_pretrained(model_name, subfolder="data-timeframe_2_model").to('cpu')
    tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="data-timeframe_2_tokenizer")
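    # The subfolder names above match the path_in_repo values used when the training branch pushed to the Hub.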
    # Define the label mappings (this must match the mapping used during training)
    label_mapping = {
        0: "lastmonth",
        1: "nextweek",
        2: "sevendays",
        3: "today",
        4: "tomorrow",
        5: "yesterday"
    }
    # Function to classify user input
    def classifyTimeFrame(user_input):
        # Tokenize and predict
        input_encoding = tokenizer(user_input, padding=True, truncation=True, return_tensors="pt").to('cpu')

        with torch.no_grad():
            attention_mask = input_encoding['attention_mask'].clone()

            # (Experimental, disabled) Modify the attention mask to emphasize certain key tokens
            # for idx, token_id in enumerate(input_encoding['input_ids'][0]):
            #     word = tokenizer.decode([token_id])
            #     if word.strip() in ["now", "same", "continue", "again", "also"]:  # Target key tokens
            #         attention_mask[0, idx] = 3  # Increase attention weight for these words
            #     else:
            #         attention_mask[0, idx] = 0
            # input_encoding['attention_mask'] = attention_mask

            output = model(**input_encoding, output_hidden_states=True)
            probabilities = F.softmax(output.logits, dim=-1)
            prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

        # Map prediction back to label
        print(prediction)
        predicted_label = label_mapping[prediction[0]]
        result = f"Predicted intent: {predicted_label}\n\n"
        print(f"Predicted intent: {predicted_label}\n")

        # Print the confidence for each label
        print("\nLabel Confidence Scores:")
        for i, label in label_mapping.items():
            confidence = probabilities[0][i].item()  # Confidence score for this label
            print(f"{label}: {confidence:.4f}")
            result += f"{label}: {confidence:.4f}\n"
        print("\n")

        return result
    iface = gr.Interface(fn=classifyTimeFrame, inputs="text", outputs="text")
    iface.launch(share=True)

    # Run the function directly (for manual testing)
    #classifyTimeFrame()