Spaces:
Runtime error
Runtime error
File size: 11,526 Bytes
99870b0 764568f ef44888 eeb6b15 f755e44 764568f 982ebc5 5052ec3 982ebc5 764568f fe6b01e 5052ec3 ac029b5 d3581ea 99870b0 ac029b5 99870b0 04be667 99870b0 31e1abc 99870b0 31e1abc 99870b0 0d61df1 a2c2250 36150aa 99870b0 b9a9b75 71eef6d 4fc50df 99870b0 698106d 5850bee a2c2250 2525b46 99870b0 ce350ed 99870b0 b497dbc 71eef6d b497dbc 99870b0 7cbb602 53a3e69 7cbb602 05d64ab 7cbb602 05d64ab 7cbb602 99870b0 7cbb602 a7badb5 cfa2743 04356f9 cfa2743 36150aa cfa2743 04356f9 cfa2743 b158209 c4695ef b158209 ebdc07d 41463ae f8beb3b 41463ae 52b0c2d b158209 487c638 b158209 f1cd61f f8beb3b b158209 41463ae 52b0c2d 99870b0 d2c2e96 511b9fc 60ed144 92f652a 99870b0 60ed144 99870b0 60ed144 489e0e4 99870b0 ce350ed 99870b0 8ebb139 722a491 99870b0 722a491 99870b0 722a491 99870b0 722a491 99870b0 722a491 99870b0 f35a1a0 722a491 f35a1a0 722a491 e95f6ce 99870b0 7d788da 57c9642 247f0df 99870b0 247f0df 99870b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 |
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
#from sklearn.linear_model import LogisticRegression
#from sklearn.metrics import accuracy_score, confusion_matrix
#import matplotlib.pyplot as plt
import seaborn as sns
#import numpy as np
import sys
import torch.nn.functional as F
#from torch.nn import CrossEntropyLoss
#from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import json
import gradio as gr
from huggingface_hub import HfApi, login, upload_folder, create_repo
import os
# Load the run configuration.
# 'arg1' selects the mode: the string '1' trains a new model, anything else
# loads the published one. 'arg2' is the base name used for the training CSV
# and for the output/model directories.
try:
    with open('config.json', 'r', encoding='utf-8') as config_file:
        config = json.load(config_file)
except FileNotFoundError:
    # Every setting below has a default, so a missing file is not fatal.
    print("config.json not found; falling back to default arguments.")
    config = {}

num_args = len(config)
arg1 = config.get('arg1', 'default_value1')
arg2 = config.get('arg2', 'default_value2')
print(f"Argument 1: {arg1}")
print(f"Argument 2: {arg2}")
print(f"Total argument size: {num_args}")

if num_args > 1:
    runModel = arg1
    print(f"Passed value: {runModel}")
    print(arg2)
else:
    print("No argument was passed.")
    # Always bind runModel: the mode check further down reads it
    # unconditionally and would otherwise fail with a NameError.
    # None never equals '1', so the script loads the pre-trained model.
    runModel = None

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
modelNameToUse = arg2
if runModel == '1':
    # Training mode: the labelled examples are read from "<arg2>.csv".
    dataFileName = arg2 + '.csv'
    print(dataFileName)
    df = pd.read_csv(dataFileName)

    # Raw 'text' and 'label' columns as plain Python lists.
    texts = df['text'].tolist()
    labels = df['label'].tolist()
    print('Train Model')

    # Replace every distinct label with its position in sorted order.
    sorted_labels = sorted(df['label'].unique())
    label_mapping = dict(zip(sorted_labels, range(len(sorted_labels))))
    df['label'] = df['label'].map(label_mapping)
    print(df['label'])

    # Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # RoBERTa tokenizer plus a classification head sized to the label set.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        output_attentions=True,
        num_labels=len(label_mapping),
    )
    model = model.to('cpu')
    model.resize_token_embeddings(len(tokenizer))

    # Tokenize both splits, padded and truncated to at most 64 tokens.
    train_encodings = tokenizer(
        train_df['text'].tolist(), truncation=True, padding=True, max_length=64
    )
    test_encodings = tokenizer(
        test_df['text'].tolist(), truncation=True, padding=True, max_length=64
    )
# Dataset class
class IntentDataset(Dataset):
    """Dataset pairing tokenizer encodings with one class label per example.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: Sequence of labels, indexable in the same order as the
            encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under 'labels'."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)
# Wrap the tokenized splits so the Trainer can index them.
train_dataset = IntentDataset(train_encodings, list(train_df['label']))
test_dataset = IntentDataset(test_encodings, list(test_df['label']))

# Authenticate with the Hugging Face Hub before training starts. Checking the
# token here means a missing secret is reported immediately instead of after
# the whole training run has finished.
token = os.getenv("hf_token")
if not token:
    raise ValueError("API token not found. Please set the hf_token environment variable.")
login(token=token)

# Newer transformers releases name this option `eval_strategy` and deprecate
# `evaluation_strategy`; use whichever field the installed version declares.
import dataclasses
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

# Training configuration; outputs and logs are kept per model name.
training_args = TrainingArguments(
    output_dir='./results_' + modelNameToUse,
    num_train_epochs=8,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.02,
    logging_dir='./logs_' + modelNameToUse,
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model, then evaluate it on the held-out split.
trainer.train()
trainer.evaluate()
# Class index -> time-frame name, used when printing predictions.
label_mapping = dict(enumerate((
    "lastmonth",
    "nextweek",
    "sevendays",
    "today",
    "tomorrow",
    "yesterday",
)))
def evaluate_and_report_errors(model, dataloader, tokenizer, label_names=None):
    """Run ``model`` over ``dataloader`` and print every misclassified example.

    Args:
        model: Classifier called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        dataloader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        tokenizer: Object whose ``decode`` turns one row of ``input_ids`` back
            into text.
        label_names: Optional mapping from class index to display name.
            Defaults to the module-level ``label_mapping``.

    Returns:
        A list with one dict per wrong prediction, holding the decoded
        ``prompt`` and the ``predicted`` and ``actual`` class indices.
    """
    if label_names is None:
        label_names = label_mapping

    # Send the batches to wherever the model's weights are, so the inputs
    # can never end up on a different device than the model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    incorrect_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(run_device)
            attention_mask = batch['attention_mask'].to(run_device)
            labels = batch['labels'].to(run_device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=1)
            # Compare as plain Python ints; one transfer per batch.
            pairs = zip(predictions.tolist(), labels.tolist())
            for i, (predicted, actual) in enumerate(pairs):
                if predicted != actual:
                    incorrect_predictions.append({
                        "prompt": tokenizer.decode(input_ids[i], skip_special_tokens=True),
                        "predicted": predicted,
                        "actual": actual,
                    })

    # Print incorrect predictions
    if incorrect_predictions:
        print("\nIncorrect Predictions:")
        for error in incorrect_predictions:
            # Fall back to the raw index when a class has no display name.
            predicted_name = label_names.get(error['predicted'], error['predicted'])
            actual_name = label_names.get(error['actual'], error['actual'])
            print(f"Sentence: {error['prompt']}")
            print(f"Predicted Label: {predicted_name} | Actual Label: {actual_name}\n")
    else:
        print("\nNo incorrect predictions found.")
    return incorrect_predictions
# Feed the training split back through the fine-tuned model in shuffled
# batches of 10 and print whichever examples it still gets wrong.
train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
evaluate_and_report_errors(model, train_dataloader, tokenizer)
# Local output directories, named after the configured model name.
model_path = './' + modelNameToUse + '_model'
tokenizer_path = './' + modelNameToUse + '_tokenizer'

# Report on (or create) each directory on its own, so the log is accurate
# when only one of the two already exists.
for _kind, _path in (("model", model_path), ("tokenizer", tokenizer_path)):
    if os.path.isdir(_path):
        print(f"The directory of {_kind} {_path} exists.")
        print("Directory contents:", os.listdir(_path))
    else:
        print(f"The directory {_path} does not exist. Creating it now...")
        os.makedirs(_path, exist_ok=True)
        print(f"Directory {_path} created successfully.")

# Save the model and tokenizer
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

# List what was written to each directory.
model_files = os.listdir(model_path)
print("Specific files in model directory:", model_files)
tokenizer_files = os.listdir(tokenizer_path)
print("Specific files in tokenizer directory:", tokenizer_files)
# Publish the saved model and tokenizer to the Hugging Face Hub.
repo_name = "Reyad-Ahmmed/hf-data-timeframe"

# The token is read from the `hf_token` environment variable; the error
# message names that same variable so the fix is obvious.
api_token = os.getenv("hf_token")
if not api_token:
    raise ValueError("API token not found. Please set the hf_token environment variable.")

# NOTE(review): `api` is not referenced in the code shown here; kept in case
# something else relies on it.
api = HfApi()

# Create repository (if not already created)
create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

# Upload each directory into its own subfolder of the repository. The
# pre-trained loading path reads these same subfolder names.
upload_folder(
    folder_path=model_path,
    path_in_repo="data-timeframe_model",
    repo_id=repo_name,
    token=api_token,
    commit_message="Update fine-tuned model for test",
)
upload_folder(
    folder_path=tokenizer_path,
    path_in_repo="data-timeframe_tokenizer",
    repo_id=repo_name,
    token=api_token,
    commit_message="Update fine-tuned tokenizer",
)

tokenizer_files = os.listdir(tokenizer_path)
print("Specific files in tokenizer directory After Commit:", tokenizer_files)
else:
print('Load Pre-trained')
#model_save_path = "./" + modelNameToUse + "_model"
#tokenizer_save_path = "./" + modelNameToUse + "_tokenizer"
model_name = "Reyad-Ahmmed/hf-data-timeframe"
# RobertaTokenizer.from_pretrained(model_save_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name, subfolder="data-timeframe_model").to('cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="data-timeframe_tokenizer")
#model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to('cpu')
#tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
# Class index -> time-frame name. This must match the mapping used during
# training.
label_mapping = dict(enumerate((
    "lastmonth",
    "nextweek",
    "sevendays",
    "today",
    "tomorrow",
    "yesterday",
)))
#Function to classify user input
def classifyTimeFrame(user_input):
    """Classify ``user_input`` and report the confidence for every label.

    Uses the module-level ``tokenizer``, ``model`` and ``label_mapping``.

    Args:
        user_input: Text to classify.

    Returns:
        A multi-line string: the predicted label first, then one
        ``label: confidence`` line per entry of ``label_mapping``. The same
        information is also printed.
    """
    # Put the inputs on the model's own device. After a training run the
    # model may sit on the GPU, and CPU inputs would then fail.
    model_device = next(model.parameters()).device
    input_encoding = tokenizer(
        user_input, padding=True, truncation=True, return_tensors="pt"
    ).to(model_device)

    # Make sure dropout is disabled regardless of what ran before this call.
    model.eval()
    with torch.no_grad():
        output = model(**input_encoding)

    probabilities = F.softmax(output.logits, dim=-1)
    prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

    # Map prediction back to label
    print(prediction)
    predicted_label = label_mapping[int(prediction[0])]
    result = f"Predicted intent: {predicted_label}\n\n"
    print(f"Predicted intent: {predicted_label}\n")

    # Print the confidence for each label
    print("\nLabel Confidence Scores:")
    for i, label in label_mapping.items():
        confidence = probabilities[0][i].item()
        print(f"{label}: {confidence:.4f}")
        result += f"{label}: {confidence:.4f}\n"
    print("\n")
    return result
# Serve the classifier through a text-in / text-out Gradio interface and
# request a public share link.
iface = gr.Interface(classifyTimeFrame, "text", "text")
iface.launch(share=True)
#Run the function
#classifyTimeFrame()
|