import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
# Load your CSV file into a pandas DataFrame
df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
# Print the column names and first few rows to verify the data structure
print(df.columns)
print(df.head())
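# Defensive check (a small addition, assuming the column names used below):
# fail early with a clear message if the CSV lacks any column this script needs
required_columns = ['Agent Utterance', 'Customer Utterance', 'Rule Followed', 'Question Asked', 'Question Answered']
missing = [col for col in required_columns if col not in df.columns]
if missing:
    raise ValueError(f"CSV is missing expected columns: {missing}")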
# Create a conversation column by merging the agent's and customer's utterances
# (fillna guards against missing utterances, which would otherwise produce NaN)
df['Conversation'] = df['Agent Utterance'].fillna('') + " " + df['Customer Utterance'].fillna('')
# Map labels for classification (Rule Followed, Question Asked, Question Answered)
df['Rule Followed'] = df['Rule Followed'].map({'Yes': 1, 'No': 0})
df['Question Asked'] = df['Question Asked'].map({'Yes': 1, 'No': 0})
df['Question Answered'] = df['Question Answered'].map({'Yes': 1, 'No': 0})
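# Sanity check (an optional addition): .map() returns NaN for any value other
# than 'Yes'/'No' (e.g. blanks or different casing), which would silently
# corrupt the labels, so drop such rows before splitting
label_columns = ['Rule Followed', 'Question Asked', 'Question Answered']
bad_rows = df[label_columns].isna().any(axis=1)
if bad_rows.any():
    print(f"Dropping {bad_rows.sum()} rows with unmapped label values")
    df = df[~bad_rows]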
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Conversation'].tolist(),
    df[['Rule Followed', 'Question Asked', 'Question Answered']].values,
    test_size=0.2,
    random_state=42  # fixed seed so the split is reproducible
)
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize the conversations
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)
# Create Hugging Face datasets (labels must be floats for the multi-label BCE loss)
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.astype(np.float32)
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels.astype(np.float32)
})
# Initialize BERT for multi-label classification: 3 independent binary labels,
# trained with BCEWithLogitsLoss via problem_type="multi_label_classification"
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    problem_type="multi_label_classification"
)
# Define the training arguments; load_best_model_at_end requires the eval and
# save strategies to match (both 'epoch' here). save_steps, do_train and
# do_eval were dropped: they have no effect with this configuration.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',   # evaluate at the end of each epoch
    save_strategy='epoch',   # save a checkpoint at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"  # key returned by compute_metrics below
)
# Subset accuracy for multi-label outputs: an example counts as correct only
# if all three labels match. Argmax is wrong here (it picks a single class),
# so the logits are passed through a sigmoid and thresholded at 0.5 instead.
def compute_metrics(p):
    probs = 1 / (1 + np.exp(-p.predictions))  # sigmoid over the logits
    preds = (probs >= 0.5).astype(np.float32)
    return {'accuracy': np.mean(np.all(preds == p.label_ids, axis=1))}

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Start training
trainer.train()
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
# Define a new conversation for testing (agent + customer utterance, in the
# same merged format as the training text)
new_conversation = ["Hello! How can I assist you today? I just wanted to check the status of my account."]
# Tokenize the new conversation (same max_length as training)
test_encodings = tokenizer(new_conversation, truncation=True, padding=True, max_length=128, return_tensors='pt')
# Move the inputs to the model's device (the Trainer may have moved it to GPU)
test_encodings = {k: v.to(model.device) for k, v in test_encodings.items()}
# Make predictions
model.eval()
with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = torch.sigmoid(outputs.logits).cpu().numpy()  # sigmoid gives per-label probabilities
# Display predictions
print(f"Predictions (Rule Followed, Question Asked, Question Answered): {predictions}")
# Round the probabilities to 0/1 (equivalent to thresholding each label at 0.5)
predictions_rounded = np.round(predictions)
print(f"Predictions (rounded): {predictions_rounded}")