Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -1,161 +1,141 @@
-import torch
-from torch.utils.data import Dataset, DataLoader  # Import Dataset here
-from transformers import BertTokenizer, BertForSequenceClassification
-from torch.optim import AdamW
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import classification_report
-import pandas as pd
-from tqdm import tqdm
-
…
-df = pd.read_csv('conversation_data.csv')
-
-# Extracting the agent and customer utterances along with labels
-agent_utterances = df['Agent Utterance'].tolist()
-customer_utterances = df['Customer Utterance'].tolist()
-rule_followed = df['Rule Followed'].tolist()
-question_asked = df['Question Asked'].tolist()
-question_answered = df['Question Answered'].tolist()
-
-# Split the data into training and validation sets
-X_train, X_val, y_train, y_val = train_test_split(
-    list(zip(agent_utterances, customer_utterances)),
-    list(zip(rule_followed, question_asked, question_answered)),
-    test_size=0.2, random_state=42
-)
…
-    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
-
-    for batch in loop:
-        # Move batch to device (GPU if available)
-        batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
-
-        # Forward pass
-        optimizer.zero_grad()
-        outputs = model(**batch)
-
-        # Compute loss for the multi-label classification
-        loss = outputs.loss
-        loss.backward()
-
-        # Step the optimizer
-        optimizer.step()
-
-        loop.set_postfix(loss=loss.item())
-
-# After training, evaluate the model on the validation set
-model.eval()
-all_preds = []
-all_labels = []
-
-with torch.no_grad():  # Disable gradient calculation for inference
-    for batch in val_loader:
-        # Move the batch to the device (GPU/CPU)
-        batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
-
-        # Forward pass: Get logits
-        outputs = model(**batch)
-
-        # Get predictions (highest probability)
-        logits = outputs.logits
-        preds = torch.argmax(logits, dim=-1)
-
-        # Append predictions and true labels
-        all_preds.append(preds.cpu().numpy())
-        all_labels.append(batch['labels'].cpu().numpy())
-
-# Flatten lists of predictions and labels
-all_preds = [item for sublist in all_preds for item in sublist]
-all_labels = [item for sublist in all_labels for item in sublist]
-
-# Print classification report for each task
-print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))
-
-# Test the model with new data
-# Example single prediction
-test_agent_utterance = "What is your account number?"
-test_customer_utterance = "888888"
-
-# Combine agent and customer utterance
-input_text = test_agent_utterance + " [SEP] " + test_customer_utterance
-
-# Tokenize the input
-inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
-
-# Move to the correct device (GPU or CPU)
-inputs = {key: value.to(device) for key, value in inputs.items()}
-
-# Predict using the model
-model.eval()
-with torch.no_grad():
-    outputs = model(**inputs)
-    logits = outputs.logits
-    preds = torch.argmax(logits, dim=-1)
-
…
-print(f"Question Answered: {preds[0][2].item()}")
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+from datasets import Dataset
+import torch
+from sklearn.metrics import accuracy_score
+
+# Load the CSV data
+df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
+
+# Clean the dataset by dropping rows with NaN values in important columns
+df = df.dropna(subset=['Agent Utterance', 'Customer Utterance', 'Category', 'Rule Followed', 'Question Asked', 'Question Answered'])
+
+# Merge Agent and Customer Utterances into a single conversation text
+df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
+
+# Define mappings for categories and labels
+category_mapping = {
+    'Greeting': 0,
+    'Addressing Issue': 1,
+    'Feedback': 2,
+    'Resolution': 3,
+    'Address': 4
+}
+
+# Map categories to numeric labels
+df['Category'] = df['Category'].map(category_mapping)
+
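Note: `Series.map` returns NaN for any category that is missing from `category_mapping`, and those NaNs would flow silently into the rest of the pipeline. A small guard like the following (illustrative only, not part of this commit) would surface that early:

    # Hypothetical guard: fail fast if the CSV contains a category the mapping doesn't cover
    unmapped = df['Category'].isna()
    assert not unmapped.any(), f"{unmapped.sum()} rows have a category missing from category_mapping"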
+# Rule validation functions to check whether each rule was followed by the agent and whether the customer answered
+def validate_rules(row):
+    missed_rules = []
+    missed_answers = []
+
+    # Rule checks for the agent
+    if 'hello' not in row['Agent Utterance'].lower() and 'hi' not in row['Agent Utterance'].lower():
+        missed_rules.append('Greeting')
+
+    if 'address' not in row['Agent Utterance'].lower():
+        missed_rules.append('Address')
+
+    if 'feedback' not in row['Agent Utterance'].lower():
+        missed_rules.append('Feedback')
+
+    if 'resolved' not in row['Agent Utterance'].lower() and 'fix' not in row['Agent Utterance'].lower():
+        missed_rules.append('Resolution')
+
+    # Check if customer answered relevant questions
+    if 'address' in row['Agent Utterance'].lower() and ('address' not in row['Customer Utterance'].lower()):
+        missed_answers.append('Customer Address Answer')
+
+    if 'feedback' in row['Agent Utterance'].lower() and ('yes' not in row['Customer Utterance'].lower() and 'no' not in row['Customer Utterance'].lower()):
+        missed_answers.append('Customer Feedback Answer')
+
+    # Returning the result as compliant or non-compliant
+    if len(missed_rules) == 0 and len(missed_answers) == 0:
+        return 1, []  # Compliant
+    else:
+        return 0, missed_rules + missed_answers  # Non-Compliant
+
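The checks above are plain substring tests, so they are deliberately rough ('hi' also matches inside words like 'this'). A quick sanity check of `validate_rules` on a hypothetical row, not part of app.py:

    # Agent greets and asks for the address; no feedback or resolution keywords appear
    sample = {
        'Agent Utterance': "Hello! Could you confirm your address?",
        'Customer Utterance': "Sure, my address is 42 Elm Street.",
    }
    compliant, missed = validate_rules(sample)
    print(compliant, missed)  # 0 ['Feedback', 'Resolution']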
+# Apply the rule validation to each row
+df[['Compliant', 'Missed Rules/Answers']] = df.apply(lambda row: pd.Series(validate_rules(row)), axis=1)
+
+# Splitting the data into training and validation datasets
+train_texts, val_texts, train_labels, val_labels = train_test_split(df['Conversation'].tolist(), df['Compliant'].tolist(), test_size=0.2)
+
+# Load pre-trained BERT tokenizer
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+# Tokenize the input data
+# (Note: this helper is defined but unused; the text lists are tokenized directly below.)
+def tokenize_function(examples):
+    return tokenizer(examples, padding="max_length", truncation=True, max_length=512)
+
+train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
+val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
+
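For reference, when called without `return_tensors` the tokenizer returns a `BatchEncoding` whose `input_ids` and `attention_mask` entries are plain Python lists, which is exactly the shape `Dataset.from_dict` expects below:

    enc = tokenizer(["Hello! How can I help?"], truncation=True, padding=True, max_length=512)
    print(type(enc['input_ids']), len(enc['input_ids']))  # <class 'list'> 1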
+# Create Dataset objects for PyTorch
+train_dataset = Dataset.from_dict({
+    'input_ids': train_encodings['input_ids'],
+    'attention_mask': train_encodings['attention_mask'],
+    'labels': train_labels
+})
+
+val_dataset = Dataset.from_dict({
+    'input_ids': val_encodings['input_ids'],
+    'attention_mask': val_encodings['attention_mask'],
+    'labels': val_labels
+})
+
+# Load pre-trained BERT model
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Binary classification (Compliant vs Non-Compliant)
+
+# Define compute_metrics function for evaluation
+def compute_metrics(p):
+    predictions, labels = p
+    # The Trainer passes predictions as a NumPy array, so take the argmax with NumPy
+    # (torch.argmax would raise a TypeError on a NumPy input)
+    predictions = predictions.argmax(axis=-1)
+    return {'accuracy': accuracy_score(labels, predictions)}
+
+# Define training arguments for the Trainer
+training_args = TrainingArguments(
+    output_dir='./results',
+    evaluation_strategy='epoch',
+    learning_rate=2e-5,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    logging_dir='./logs',
+)
+
+# Initialize Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    compute_metrics=compute_metrics
+)
+
+# Train the model
+trainer.train()
+
+# Evaluate the model
+eval_results = trainer.evaluate()
+print(f"Evaluation results: {eval_results}")
+
+# Save the trained model
+model.save_pretrained('./dishTV_bert_model')
+tokenizer.save_pretrained('./dishTV_bert_model')
+
+# Testing the model with an example
+def predict(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Move the inputs to the same device as the model (the Trainer may have moved it to GPU)
+    inputs = {key: value.to(model.device) for key, value in inputs.items()}
+    model.eval()
+    with torch.no_grad():  # No gradients needed for inference
+        outputs = model(**inputs)
+    prediction = torch.argmax(outputs.logits, dim=-1)
+    return prediction.item()
+
+# Example test
+test_text = "Hello! I need help with my DishTV subscription."
+prediction = predict(test_text)
+predicted_compliance = "Compliant" if prediction == 1 else "Non-Compliant"
+print(f"Predicted Compliance: {predicted_compliance}")
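Once saved, the fine-tuned model can be reloaded elsewhere through the standard `from_pretrained` API; a minimal sketch, assuming the directory layout the save calls above produce:

    from transformers import BertTokenizer, BertForSequenceClassification

    tokenizer = BertTokenizer.from_pretrained('./dishTV_bert_model')
    model = BertForSequenceClassification.from_pretrained('./dishTV_bert_model')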