# Spaces: Runtime error
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments | |
import torch | |
from torch.utils.data import Dataset | |
from torch.utils.data import DataLoader | |
from transformers import RobertaTokenizer, RobertaForSequenceClassification | |
import pandas as pd | |
#from sklearn.linear_model import LogisticRegression | |
#from sklearn.metrics import accuracy_score, confusion_matrix | |
#import matplotlib.pyplot as plt | |
import seaborn as sns | |
#import numpy as np | |
import sys | |
import torch.nn.functional as F | |
#from torch.nn import CrossEntropyLoss | |
#from sklearn.decomposition import PCA | |
import matplotlib.pyplot as plt | |
import json | |
import gradio as gr | |
from huggingface_hub import HfApi, login, upload_folder, create_repo | |
import os | |
# Load the run configuration.
# 'arg1' selects the run mode ('1' trains a new model, anything else loads the
# published one); 'arg2' is the dataset / model base name.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

num_args = len(config)
arg1 = config.get('arg1', 'default_value1')
arg2 = config.get('arg2', 'default_value2')
print(f"Argument 1: {arg1}")
print(f"Argument 2: {arg2}")
print(f"Total argument size: {num_args}")

# Bind runModel unconditionally: it is compared against '1' right after this
# block, so leaving it unset when the config holds fewer than two keys would
# end in a NameError instead of falling back to the pre-trained path.
runModel = arg1
if num_args > 1:
    print(f"Passed value: {runModel}")
    print(arg2)
else:
    print("No argument was passed.")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
modelNameToUse = arg2
if runModel == '1':
    # The training CSV is named after the second config argument.
    dataFileName = arg2 + '.csv'
    print(dataFileName)
    df = pd.read_csv(dataFileName)

    # Keep the raw columns around as plain Python lists.
    texts = df['text'].tolist()
    labels = df['label'].tolist()
    print('Train Model')

    # Replace each label with its rank in the sorted list of distinct labels.
    sorted_labels = sorted(df['label'].unique())
    label_mapping = dict(zip(sorted_labels, range(len(sorted_labels))))
    df['label'] = df['label'].map(label_mapping)
    print(df['label'])

    # Hold out 20% of the rows for evaluation (fixed seed for repeatability).
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Start from the stock RoBERTa checkpoint with one output per label.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        output_attentions=True,
        num_labels=len(label_mapping),
    ).to('cpu')
    model.resize_token_embeddings(len(tokenizer))

    # Tokenize both splits with identical settings.
    train_encodings = tokenizer(
        list(train_df['text']),
        truncation=True,
        padding=True,
        max_length=64,
    )
    test_encodings = tokenizer(
        list(test_df['text']),
        truncation=True,
        padding=True,
        max_length=64,
    )
# Dataset class | |
class IntentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping of field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence of labels, one per example, in the same order.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)
# Wrap both splits so the Trainer can index them.
train_dataset = IntentDataset(train_encodings, list(train_df['label']))
test_dataset = IntentDataset(test_encodings, list(test_df['label']))

# Authenticate with the Hugging Face Hub. Fail fast when the token is missing:
# login(token=None) would fall back to an interactive prompt, and the upload
# step needs the same token anyway, so there is no point training first.
token = os.getenv("hf_token")
if not token:
    raise ValueError("API token not found. Please set the hf_token environment variable.")
login(token=token)

training_args = TrainingArguments(
    output_dir='./results_' + modelNameToUse,
    num_train_epochs=8,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.02,
    logging_dir='./logs_' + modelNameToUse,
    logging_steps=10,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the version pinned for this Space.
    evaluation_strategy="epoch",  # evaluate once per epoch
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune, then score the held-out split.
trainer.train()
trainer.evaluate()

# Class index -> intent name, used when reporting predictions.
label_mapping = {
    0: "lastmonth",
    1: "nextweek",
    2: "sevendays",
    3: "today",
    4: "tomorrow",
    5: "yesterday",
}
def evaluate_and_report_errors(model, dataloader, tokenizer, label_names=None):
    """Run ``model`` over ``dataloader`` and print every misclassified example.

    Args:
        model: Classifier called with ``input_ids`` and ``attention_mask``
            keyword arguments; its output must expose ``.logits``.
        dataloader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        tokenizer: Object whose ``decode`` turns a row of token ids into text.
        label_names: Optional mapping of class index to display name.
            Defaults to the module-level ``label_mapping``. Indices missing
            from the mapping are printed as the raw index.

    Returns:
        A list of dicts with keys ``prompt``, ``predicted`` and ``actual``,
        one per misclassified example (empty when every prediction matched).
    """
    if label_names is None:
        label_names = label_mapping

    model.eval()
    # Feed the batches to whichever device the model actually lives on; a
    # separately computed global device can disagree with it (for example
    # after a Trainer has relocated the model).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    incorrect_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['labels'].to(model_device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=1)

            # Convert once per batch instead of calling .item() per element.
            for ids, predicted, actual in zip(input_ids, predictions.tolist(), labels.tolist()):
                if predicted != actual:
                    incorrect_predictions.append({
                        "prompt": tokenizer.decode(ids, skip_special_tokens=True),
                        "predicted": predicted,
                        "actual": actual,
                    })

    if incorrect_predictions:
        print("\nIncorrect Predictions:")
        for error in incorrect_predictions:
            predicted_name = label_names.get(error['predicted'], error['predicted'])
            actual_name = label_names.get(error['actual'], error['actual'])
            print(f"Sentence: {error['prompt']}")
            print(f"Predicted Label: {predicted_name} | Actual Label: {actual_name}\n")
    else:
        print("\nNo incorrect predictions found.")

    return incorrect_predictions
# Re-run the fine-tuned model over the training split and list its misses.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=10,
    shuffle=True,
)
evaluate_and_report_errors(model, train_dataloader, tokenizer)

# Local output folders are named after the model base name.
model_path = './' + modelNameToUse + '_model'
tokenizer_path = './' + modelNameToUse + '_tokenizer'
if os.path.isdir(model_path) and os.path.isdir(tokenizer_path):
    print(f"The directory of model {model_path} exists.")
    print("Directory contents:", os.listdir(model_path))
    print(f"The directory of tokenizer{tokenizer_path} exists.")
    print("Directory contents:", os.listdir(tokenizer_path))
else:
    print(f"The directory {model_path} does not exist. Creating it now...")
    print(f"The directory {tokenizer_path} does not exist. Creating it now...")
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(tokenizer_path, exist_ok=True)
    print(f"Directory {model_path} created successfully.")
    print(f"Directory {tokenizer_path} created successfully.")

# Save unconditionally so the upload below always carries the weights that
# were just trained, whether or not the folders already existed.
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

# Show what was written.
model_files = os.listdir(model_path)
print("Specific files in model directory:", model_files)
tokenizer_files = os.listdir(tokenizer_path)
print("Specific files in tokenizer directory:", tokenizer_files)

# Target repository on the Hugging Face Hub.
repo_name = "Reyad-Ahmmed/hf-data-timeframe"
api_token = os.getenv("hf_token")
if not api_token:
    # Name the variable that is actually read, so the message is actionable.
    raise ValueError("API token not found. Please set the hf_token environment variable.")

# Create the repository if needed, then push both folders into the
# subfolders that the pre-trained branch loads from.
api = HfApi()
create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
upload_folder(
    folder_path=model_path,
    path_in_repo="data-timeframe_2_model",
    repo_id=repo_name,
    token=api_token,
    commit_message="Update fine-tuned model for test",
)
upload_folder(
    folder_path=tokenizer_path,
    path_in_repo="data-timeframe_2_tokenizer",
    repo_id=repo_name,
    token=api_token,
    commit_message="Update fine-tuned tokenizer",
)
tokenizer_files = os.listdir(tokenizer_path)
print("Specific files in tokenizer directory After Commit:", tokenizer_files)
else: | |
print('Load Pre-trained') | |
#model_save_path = "./" + modelNameToUse + "_2_model" | |
#tokenizer_save_path = "./" + modelNameToUse + "_2_tokenizer" | |
model_name = "Reyad-Ahmmed/hf-data-timeframe" | |
# RobertaTokenizer.from_pretrained(model_save_path) | |
model = AutoModelForSequenceClassification.from_pretrained(model_name, subfolder="data-timeframe_2_model").to('cpu') | |
tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="data-timeframe_2_tokenizer") | |
#model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu') | |
#tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path) | |
#Define the label mappings (this must match the mapping used during training) | |
label_mapping = { | |
0: "lastmonth", | |
1: "nextweek", | |
2: "sevendays", | |
3: "today", | |
4: "tomorrow", | |
5: "yesterday" | |
} | |
#Function to classify user input | |
def classifyTimeFrame(user_input):
    """Classify ``user_input`` and report a confidence score for every label.

    Uses the module-level ``tokenizer``, ``model`` and ``label_mapping``.

    Args:
        user_input: Text to classify.

    Returns:
        A multi-line string: the predicted intent, a blank line, then one
        ``label: confidence`` line per entry of ``label_mapping``. The same
        information is also printed to stdout.
    """
    # Put the inputs on the model's own device. Hard-coding 'cpu' breaks as
    # soon as the model has been moved elsewhere (e.g. by a Trainer on GPU).
    model_device = next(model.parameters()).device
    input_encoding = tokenizer(
        user_input, padding=True, truncation=True, return_tensors="pt"
    ).to(model_device)

    with torch.no_grad():
        # Only the logits are needed, so hidden states are not requested.
        output = model(**input_encoding)

    probabilities = F.softmax(output.logits, dim=-1)
    prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

    # Map the predicted class index back to its label.
    print(prediction)
    predicted_label = label_mapping[prediction[0]]
    print(f"Predicted intent: {predicted_label}\n")

    # Build the per-label confidence report while echoing it to stdout.
    report_lines = [f"Predicted intent: {predicted_label}\n\n"]
    print("\nLabel Confidence Scores:")
    for i, label in label_mapping.items():
        confidence = probabilities[0][i].item()
        print(f"{label}: {confidence:.4f}")
        report_lines.append(f"{label}: {confidence:.4f}\n")
    print("\n")
    return "".join(report_lines)
# Expose the classifier through a text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=classifyTimeFrame,
    inputs="text",
    outputs="text",
)
iface.launch(share=True)