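"""Fine-tune a RoBERTa intent classifier for time-frame phrases and serve it with Gradio.

The run mode is read from config.json: training fine-tunes roberta-base on a local
CSV and pushes the artifacts to the Hugging Face Hub; otherwise the previously
fine-tuned model is pulled from the Hub and exposed through a Gradio text interface.
"""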
import os
import json

import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import gradio as gr
from huggingface_hub import HfApi, upload_folder, create_repo
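# Illustrative config.json (key names taken from the code below; values are examples):
# {
#     "arg1": "1",
#     "arg2": "data-timeframe"
# }
# arg1 == "1" selects training; any other value loads the pre-trained model.
# arg2 is the base name used for the CSV file and the results/logs folders.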
# Load configuration file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

num_args = len(config)
arg1 = config.get('arg1', 'default_value1')
arg2 = config.get('arg2', 'default_value2')
print(f"Argument 1: {arg1}")
print(f"Argument 2: {arg2}")
print(f"Total number of config entries: {num_args}")

# arg1 selects the run mode; assign it unconditionally so it is always defined
runModel = arg1
if num_args > 1:
    print(f"Passed value: {runModel}")
    print(arg2)
else:
    print("No argument was passed.")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
modelNameToUse = arg2
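# Branch: '1' trains a fresh model from the CSV; anything else loads the
# fine-tuned model from the Hub and serves it.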
if runModel == '1':
    dataFileName = arg2 + '.csv'
    print(dataFileName)

    # Load the data from the CSV file
    df = pd.read_csv(dataFileName)

    # Access the text and labels
    texts = df['text'].tolist()
    labels = df['label'].tolist()
    print('Train Model')

    # Encode the labels; sorting makes the integer ids deterministic
    sorted_labels = sorted(df['label'].unique())
    label_mapping = {label: i for i, label in enumerate(sorted_labels)}
    df['label'] = df['label'].map(label_mapping)
    print(df['label'])

    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Tokenization
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Model and training setup
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', output_attentions=True, num_labels=len(label_mapping)
    ).to(device)
    model.resize_token_embeddings(len(tokenizer))  # no tokens were added, so this is effectively a no-op

    train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True, max_length=64)
    test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=64)
    # Dataset class wrapping the tokenizer output for the Trainer
    class IntentDataset(Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = IntentDataset(train_encodings, list(train_df['label']))
    test_dataset = IntentDataset(test_encodings, list(test_df['label']))
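    # Each dataset item is a dict of tensors (input_ids, attention_mask, labels),
    # which is the batch format the Trainer expects.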
    # Hugging Face repository to push artifacts to
    repo_name = "Reyad-Ahmmed/hf-data-timeframe"
    api_token = os.getenv("HF_API_TOKEN")  # Retrieve the API token from environment variable

    if not api_token:
        raise ValueError("API token not found. Please set the HF_API_TOKEN environment variable.")

    # Create repository (exist_ok=True makes this a no-op if it already exists)
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)
    training_args = TrainingArguments(
        output_dir='./results_' + modelNameToUse,
        num_train_epochs=2,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=500,
        weight_decay=0.02,
        logging_dir='./logs_' + modelNameToUse,
        logging_steps=10,
        evaluation_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    trainer.evaluate()

    # Upload the training results only after training has produced them
    upload_folder(
        folder_path=training_args.output_dir,
        path_in_repo=f"{modelNameToUse}_results",
        repo_id=repo_name,
        token=api_token,
        commit_message="Upload training results"
    )
    # Inverse mapping (id -> label); must match the sorted-label encoding above
    label_mapping = {
        0: "lastmonth",
        1: "nextweek",
        2: "sevendays",
        3: "today",
        4: "tomorrow",
        5: "yesterday"
    }
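    # Walk a dataloader and print every misclassified prompt so problem
    # phrasings can be inspected by eye.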
    def evaluate_and_report_errors(model, dataloader, tokenizer):
        model.eval()
        incorrect_predictions = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)

                for i, prediction in enumerate(predictions):
                    if prediction != labels[i]:
                        incorrect_predictions.append({
                            "prompt": tokenizer.decode(input_ids[i], skip_special_tokens=True),
                            "predicted": prediction.item(),
                            "actual": labels[i].item()
                        })

        # Print incorrect predictions
        if incorrect_predictions:
            print("\nIncorrect Predictions:")
            for error in incorrect_predictions:
                print(f"Sentence: {error['prompt']}")
                print(f"Predicted Label: {label_mapping[error['predicted']]} | Actual Label: {label_mapping[error['actual']]}\n")
        else:
            print("\nNo incorrect predictions found.")

    # Note: errors are reported on the training set here, not the held-out test set
    train_dataloader = DataLoader(train_dataset, batch_size=10, shuffle=True)
    evaluate_and_report_errors(model, train_dataloader, tokenizer)
    # Save the model and tokenizer locally
    local_model_path = "./data-timeframe_model"
    local_tokenizer_path = "./data-timeframe_tokenizer"
    model.save_pretrained(local_model_path)
    tokenizer.save_pretrained(local_tokenizer_path)

    # Upload the model and tokenizer to the Hugging Face repository
    upload_folder(
        folder_path=local_model_path,
        path_in_repo="data-timeframe_model",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned model"
    )

    upload_folder(
        folder_path=local_tokenizer_path,
        path_in_repo="data-timeframe_tokenizer",
        repo_id=repo_name,
        token=api_token,
        commit_message="Update fine-tuned tokenizer"
    )
else:
    print('Load Pre-trained')
    model_name = "Reyad-Ahmmed/hf-data-timeframe"

    # Load the fine-tuned model and tokenizer from their subfolders in the repo
    model = AutoModelForSequenceClassification.from_pretrained(model_name, subfolder="data-timeframe_model").to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder="data-timeframe_tokenizer")

    # Define the label mappings (must match the mapping used during training)
    label_mapping = {
        0: "lastmonth",
        1: "nextweek",
        2: "sevendays",
        3: "today",
        4: "tomorrow",
        5: "yesterday"
    }
    # Classify a user prompt and report per-label confidence
    def classifyTimeFrame(user_input):
        # Tokenize and predict
        input_encoding = tokenizer(user_input, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            output = model(**input_encoding)
            probabilities = F.softmax(output.logits, dim=-1)
            prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

        # Map prediction back to label
        print(prediction)
        predicted_label = label_mapping[prediction[0]]
        result = f"Predicted intent: {predicted_label}\n\n"
        print(f"Predicted intent: {predicted_label}\n")

        # Print the confidence for each label
        print("\nLabel Confidence Scores:")
        for i, label in label_mapping.items():
            confidence = probabilities[0][i].item()  # Confidence score for this label
            print(f"{label}: {confidence:.4f}")
            result += f"{label}: {confidence:.4f}\n"
        print("\n")

        return result

    iface = gr.Interface(fn=classifyTimeFrame, inputs="text", outputs="text")
    iface.launch(share=True)
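    # With share=True, launch() also prints a temporary public URL alongside the local one.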