Spaces:
Runtime error
Runtime error
File size: 5,132 Bytes
a7284f0 d64149e b409f35 d64149e 1432ebd d64149e b409f35 d64149e 1432ebd d64149e 1432ebd d64149e e7ed4e3 1432ebd d64149e 1432ebd d64149e 1432ebd d64149e b409f35 d64149e b409f35 d64149e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import accuracy_score
# Read the raw customer-service transcript export and immediately discard any
# row that is missing one of the columns the rest of the script relies on.
df = pd.read_csv(
    "dishTV_customer_service_with_address_and_rules_accurate_v2.csv"
).dropna(
    subset=[
        'Agent Utterance',
        'Customer Utterance',
        'Category',
        'Rule Followed',
        'Question Asked',
        'Question Answered',
    ],
)

# Join both sides of the exchange into one text field, separated by a space.
df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']

# Category names are numbered in this fixed order (Greeting=0 ... Address=4).
category_mapping = {
    label: index
    for index, label in enumerate(
        ['Greeting', 'Addressing Issue', 'Feedback', 'Resolution', 'Address']
    )
}

# Swap category names for their numeric ids; names that are not in the mapping
# come out of Series.map as NaN.
df['Category'] = df['Category'].map(category_mapping)
# Rule validation: check whether the agent followed each scripted rule and
# whether the customer answered the questions the agent asked.
def validate_rules(row):
    """Score one conversation row against the keyword-based compliance rules.

    Args:
        row: Mapping-like object (a DataFrame row or a plain dict) holding
            string values under 'Agent Utterance' and 'Customer Utterance'.

    Returns:
        ``(1, [])`` when nothing was missed, otherwise ``(0, missed)`` where
        ``missed`` lists the missed agent rules followed by the missed
        customer answers.
    """
    import re  # local import: the file does not import ``re`` at module level

    agent_text = row['Agent Utterance'].lower()
    customer_text = row['Customer Utterance'].lower()

    # Short keywords are matched as whole words. A plain substring test would
    # find "hi" inside "this" and "no" inside "not"/"know", which made the
    # greeting and feedback-answer checks pass almost unconditionally.
    agent_words = set(re.findall(r"\w+", agent_text))
    customer_words = set(re.findall(r"\w+", customer_text))

    missed_rules = []
    # Rule checks for the agent
    if agent_words.isdisjoint({'hello', 'hi'}):
        missed_rules.append('Greeting')
    # Longer keywords stay substring matches so inflected forms
    # ("addresses", "fixed", "fixing") still count.
    if 'address' not in agent_text:
        missed_rules.append('Address')
    if 'feedback' not in agent_text:
        missed_rules.append('Feedback')
    if 'resolved' not in agent_text and 'fix' not in agent_text:
        missed_rules.append('Resolution')

    missed_answers = []
    # A customer answer is only expected when the agent raised the topic.
    if 'address' in agent_text and 'address' not in customer_text:
        missed_answers.append('Customer Address Answer')
    if 'feedback' in agent_text and customer_words.isdisjoint({'yes', 'no'}):
        missed_answers.append('Customer Feedback Answer')

    missed = missed_rules + missed_answers
    if not missed:
        return 1, []  # Compliant
    return 0, missed  # Non-Compliant
# Run the rule checker on every row and store its two outputs (the compliance
# flag and the list of missed items) as two new columns.
df[['Compliant', 'Missed Rules/Answers']] = df.apply(
    lambda record: pd.Series(validate_rules(record)),
    axis='columns',
)

# Hold out 20% of the conversations for validation; the compliance flag is
# the classification label.
conversation_texts = df['Conversation'].tolist()
compliance_labels = df['Compliant'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    conversation_texts,
    compliance_labels,
    test_size=0.2,
)

# Tokenizer matching the pre-trained checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Helper that tokenizes raw text with fixed-length padding.
def tokenize_function(examples):
    """Tokenize ``examples`` with the module-level BERT tokenizer.

    Every sequence is padded to, and truncated at, 512 tokens.

    Args:
        examples: Text input forwarded unchanged to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``examples``.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples, **encoding_options)
# Encode both splits, truncating at 512 tokens and padding within each split.
train_encodings = tokenizer(train_texts, max_length=512, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, max_length=512, truncation=True, padding=True)

# Wrap token ids, attention masks and labels as `datasets.Dataset` objects,
# the format handed to the Trainer further down.
train_dataset = Dataset.from_dict(
    dict(
        input_ids=train_encodings['input_ids'],
        attention_mask=train_encodings['attention_mask'],
        labels=train_labels,
    )
)
val_dataset = Dataset.from_dict(
    dict(
        input_ids=val_encodings['input_ids'],
        attention_mask=val_encodings['attention_mask'],
        labels=val_labels,
    )
)

# Pre-trained BERT with a two-way classification head
# (Compliant vs Non-Compliant).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
# Define compute_metrics function for evaluation
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair as supplied by ``Trainer``;
            ``predictions`` holds per-class scores with the class axis last.

    Returns:
        A dict with a single ``'accuracy'`` entry.
    """
    predictions, labels = p
    # Trainer hands over NumPy arrays, which torch.argmax rejects with a
    # TypeError. torch.as_tensor accepts arrays and tensors alike, and the
    # result is converted back to NumPy for scikit-learn.
    predicted_classes = torch.as_tensor(predictions).argmax(dim=-1).cpu().numpy()
    return {'accuracy': accuracy_score(labels, predicted_classes)}
# Define training arguments for the Trainer
import dataclasses  # script-level import, used only for the compatibility probe below

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword, so passing it unconditionally raises a
# TypeError there. Probe the dataclass fields and use whichever name the
# installed version accepts, preferring the new one.
_training_argument_names = {
    field.name for field in dataclasses.fields(TrainingArguments)
}
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in _training_argument_names
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    # Evaluate at the end of every epoch.
    **{_eval_strategy_key: 'epoch'},
)
# Wire the model, hyper-parameters, both datasets and the metric callback
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune on the training split.
trainer.train()

# Score the fine-tuned model on the validation split and report the metrics.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Persist the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./dishTV_bert_model')
# Testing the model with an example
def predict(text):
    """Classify a single conversation text with the fine-tuned model.

    Args:
        text: The conversation string to classify.

    Returns:
        The index of the highest-scoring class as a Python int; the caller
        below treats 1 as "Compliant".
    """
    # Put the model in inference mode so dropout is disabled.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Training may have moved the model to an accelerator, while the tokenizer
    # returns CPU tensors; both must sit on the same device.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # No gradients are needed for a forward-only pass.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return prediction.item()
# Smoke-test the classifier on a single sample utterance.
test_text = "Hello! I need help with my DishTV subscription."
prediction = predict(test_text)

# Class 1 is reported as compliant; any other class as non-compliant.
if prediction == 1:
    predicted_compliance = "Compliant"
else:
    predicted_compliance = "Non-Compliant"
print(f"Predicted Compliance: {predicted_compliance}")
|