import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
# Load your CSV file into a pandas DataFrame
df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
# Print column names and the first few rows to verify the expected structure
print(df.columns)
print(df.head())
# Create a conversation column by merging the agent's and customer's utterances
df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
# Map the three Yes/No label columns to 1/0 (Rule Followed, Question Asked, Question Answered)
df['Rule Followed'] = df['Rule Followed'].map({'Yes': 1, 'No': 0})
df['Question Asked'] = df['Question Asked'].map({'Yes': 1, 'No': 0})
df['Question Answered'] = df['Question Answered'].map({'Yes': 1, 'No': 0})
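# Optional sanity check (added sketch): pandas .map() turns any value outside
# {'Yes', 'No'} into NaN, which would silently corrupt the labels below.
label_cols = ['Rule Followed', 'Question Asked', 'Question Answered']
assert not df[label_cols].isnull().any().any(), "Found label values other than 'Yes'/'No' in the CSV"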
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Conversation'].tolist(),
    df[['Rule Followed', 'Question Asked', 'Question Answered']].values,
    test_size=0.2,
    random_state=42  # fixed seed so the split is reproducible across runs
)
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize the conversations
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)
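# Optional spot check (a sketch, assuming the first training example is representative):
# decode one encoded example to confirm the tokenizer round-trips the merged conversation.
print(tokenizer.decode(train_encodings['input_ids'][0]))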
# Create PyTorch datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.astype(np.float32)  # float labels: BCEWithLogitsLoss expects float targets
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels.astype(np.float32)
})
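# Quick shape check: each example should carry one float per label head.
assert len(train_dataset[0]['labels']) == 3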
# Initialize the BERT model for multi-label classification (3 independent binary labels)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    problem_type="multi_label_classification"  # use BCEWithLogitsLoss instead of cross-entropy
)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',   # evaluate at the end of each epoch
    save_strategy='epoch',   # save a checkpoint at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True,       # reload the best checkpoint (by eval accuracy) after training
    metric_for_best_model="accuracy",
    do_train=True,
    do_eval=True
)
# Exact-match (subset) accuracy for multi-label outputs: a prediction counts as
# correct only if all three labels are right. Note that logit > 0 <=> sigmoid(logit) > 0.5,
# so we can threshold the raw logits directly (argmax would pick a single class and is
# wrong for independent per-label decisions).
def compute_metrics(p):
    preds = (p.predictions > 0).astype(int)
    labels = p.label_ids.astype(int)
    return {'accuracy': np.mean(np.all(preds == labels, axis=1))}

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Start training
trainer.train()
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
# Define a new conversation for testing
new_conversation = ["Hello! How can I assist you today? I just wanted to check the status of my account."]
# Tokenize the new conversation
test_encodings = tokenizer(new_conversation, truncation=True, padding=True, max_length=128, return_tensors='pt')  # same max_length as training
# Make predictions
model.eval()
test_encodings = {k: v.to(model.device) for k, v in test_encodings.items()}  # Trainer may have moved the model to GPU
with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = torch.sigmoid(outputs.logits).cpu().numpy()  # sigmoid gives independent per-label probabilities
# Display predictions
print(f"Predictions (Rule Followed, Question Asked, Question Answered): {predictions}")
# Threshold at 0.5 (via rounding): each label is an independent binary decision
predictions_rounded = np.round(predictions)
print(f"Predictions (rounded): {predictions_rounded}")