ketanchaudhary88 committed on
Commit
d64149e
·
verified ·
1 Parent(s): 4c3f463

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -149
app.py CHANGED
@@ -1,161 +1,141 @@
1
- import torch
2
- from torch.utils.data import Dataset, DataLoader # Import Dataset here
3
- from transformers import BertTokenizer, BertForSequenceClassification
4
- from torch.optim import AdamW
5
- from sklearn.model_selection import train_test_split
6
- from sklearn.metrics import classification_report
7
  import pandas as pd
8
- from tqdm import tqdm
 
 
 
 
9
 
10
- # Initialize the tokenizer and model for BERT
11
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
12
- model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # Multi-label classification
13
-
14
- # Ensure the model is on the right device (GPU or CPU)
15
- device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
16
- model.to(device)
17
-
18
- # Custom Dataset Class for Conversation Data
19
- class ConversationDataset(Dataset):
20
- def __init__(self, agent_utterances, customer_utterances, rule_followed, question_asked, question_answered):
21
- self.agent_utterances = agent_utterances
22
- self.customer_utterances = customer_utterances
23
- self.rule_followed = rule_followed
24
- self.question_asked = question_asked
25
- self.question_answered = question_answered
26
-
27
- def __len__(self):
28
- return len(self.agent_utterances)
 
 
 
 
 
 
 
 
 
 
29
 
30
- def __getitem__(self, idx):
31
- agent_text = self.agent_utterances[idx]
32
- customer_text = self.customer_utterances[idx]
33
-
34
- # Combine both agent and customer utterances into one sequence
35
- input_text = agent_text + " [SEP] " + customer_text
36
-
37
- # Tokenize the input
38
- inputs = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
39
-
40
- # Return inputs and labels for each task (Rule Followed, Question Asked, Question Answered)
41
- labels = torch.tensor([self.rule_followed[idx], self.question_asked[idx], self.question_answered[idx]], dtype=torch.long)
42
- return {**inputs, 'labels': labels}
43
-
44
- # Example of reading the data from CSV
45
- df = pd.read_csv('conversation_data.csv')
46
-
47
- # Extracting the agent and customer utterances along with labels
48
- agent_utterances = df['Agent Utterance'].tolist()
49
- customer_utterances = df['Customer Utterance'].tolist()
50
- rule_followed = df['Rule Followed'].tolist()
51
- question_asked = df['Question Asked'].tolist()
52
- question_answered = df['Question Answered'].tolist()
53
-
54
- # Split the data into training and validation sets
55
- X_train, X_val, y_train, y_val = train_test_split(
56
- list(zip(agent_utterances, customer_utterances)),
57
- list(zip(rule_followed, question_asked, question_answered)),
58
- test_size=0.2, random_state=42
59
- )
60
 
61
- # Convert to individual lists
62
- train_agent, train_customer = zip(*X_train)
63
- train_rule, train_question, train_answer = zip(*y_train)
 
 
64
 
65
- val_agent, val_customer = zip(*X_val)
66
- val_rule, val_question, val_answer = zip(*y_val)
67
 
68
- # Create dataset objects for training and validation
69
- train_dataset = ConversationDataset(
70
- train_agent, train_customer, train_rule, train_question, train_answer
71
- )
72
- val_dataset = ConversationDataset(
73
- val_agent, val_customer, val_rule, val_question, val_answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  )
75
 
76
- # Create DataLoader for training and validation
77
- train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
78
- val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
 
 
 
 
 
79
 
80
- # Set optimizer
81
- optimizer = AdamW(model.parameters(), lr=1e-5)
82
 
83
- # Training loop
84
- for epoch in range(3): # Training for 3 epochs
85
- model.train()
86
- loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
87
-
88
- for batch in loop:
89
- # Move batch to device (GPU if available)
90
- batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
91
-
92
- # Forward pass
93
- optimizer.zero_grad()
94
- outputs = model(**batch)
95
-
96
- # Compute loss for the multi-label classification
97
- loss = outputs.loss
98
- loss.backward()
99
-
100
- # Step the optimizer
101
- optimizer.step()
102
-
103
- loop.set_postfix(loss=loss.item())
104
-
105
- # After training, evaluate the model on the validation set
106
- model.eval()
107
- all_preds = []
108
- all_labels = []
109
-
110
- with torch.no_grad(): # Disable gradient calculation for inference
111
- for batch in val_loader:
112
- # Move the batch to the device (GPU/CPU)
113
- batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
114
-
115
- # Forward pass: Get logits
116
- outputs = model(**batch)
117
-
118
- # Get predictions (highest probability)
119
- logits = outputs.logits
120
- preds = torch.argmax(logits, dim=-1)
121
-
122
- # Append predictions and true labels
123
- all_preds.append(preds.cpu().numpy())
124
- all_labels.append(batch['labels'].cpu().numpy())
125
-
126
- # Flatten lists of predictions and labels
127
- all_preds = [item for sublist in all_preds for item in sublist]
128
- all_labels = [item for sublist in all_labels for item in sublist]
129
-
130
- # Print classification report for each task
131
- print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))
132
-
133
-
134
-
135
-
136
- # Test the model with new data
137
- # Example single prediction
138
- test_agent_utterance = "What is your account number?"
139
- test_customer_utterance = "888888"
140
-
141
- # Combine agent and customer utterance
142
- input_text = test_agent_utterance + " [SEP] " + test_customer_utterance
143
-
144
- # Tokenize the input
145
- inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
146
-
147
- # Move to the correct device (GPU or CPU)
148
- inputs = {key: value.to(device) for key, value in inputs.items()}
149
-
150
- # Predict using the model
151
- model.eval()
152
- with torch.no_grad():
153
- outputs = model(**inputs)
154
- logits = outputs.logits
155
- preds = torch.argmax(logits, dim=-1)
156
 
157
- # Display the prediction for each label
158
- print(f"Rule Followed: {preds[0][0].item()}")
159
- print(f"Question Asked: {preds[0][1].item()}")
160
- print(f"Question Answered: {preds[0][2].item()}")
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
4
+ from datasets import Dataset
5
+ import torch
6
+ from sklearn.metrics import accuracy_score
7
 
8
+ # Load the CSV data
9
+ df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
10
+
11
+ # Clean the dataset by dropping rows with NaN values in important columns
12
+ df = df.dropna(subset=['Agent Utterance', 'Customer Utterance', 'Category', 'Rule Followed', 'Question Asked', 'Question Answered'])
13
+
14
+ # Merge Agent and Customer Utterances into a single conversation text
15
+ df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
16
+
17
+ # Define mappings for categories and labels
18
+ category_mapping = {
19
+ 'Greeting': 0,
20
+ 'Addressing Issue': 1,
21
+ 'Feedback': 2,
22
+ 'Resolution': 3,
23
+ 'Address': 4
24
+ }
25
+
26
+ # Map categories to numeric labels
27
+ df['Category'] = df['Category'].map(category_mapping)
28
+
29
+ # Rule validation functions to check whether each rule was followed by the agent and whether the customer answered
30
def validate_rules(row):
    """Check one conversation row against the keyword-based compliance rules.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            'Agent Utterance' and 'Customer Utterance' text fields.

    Returns:
        A ``(compliant, missed)`` tuple. ``compliant`` is 1 when every check
        passed and 0 otherwise; ``missed`` lists the failed agent rules
        followed by the failed customer-answer checks (empty when compliant).
    """
    import re  # local import so the block does not depend on file-level imports

    # Lowercase each utterance once instead of on every comparison.
    agent = row['Agent Utterance'].lower()
    customer = row['Customer Utterance'].lower()

    def has_word(alternatives, text):
        # Whole-word match: a bare substring test lets 'hi' match "this"
        # and 'no' match "know", which silently passes the rule.
        return re.search(rf"\b(?:{alternatives})\b", text) is not None

    missed_rules = []
    missed_answers = []

    # Rule checks for the agent.
    if not has_word('hello|hi', agent):
        missed_rules.append('Greeting')

    if 'address' not in agent:
        missed_rules.append('Address')

    if 'feedback' not in agent:
        missed_rules.append('Feedback')

    # 'fix' is intentionally a stem match so "fixed"/"fixing" also count.
    if 'resolved' not in agent and 'fix' not in agent:
        missed_rules.append('Resolution')

    # Customer-answer checks only apply when the agent raised the topic.
    if 'address' in agent and 'address' not in customer:
        missed_answers.append('Customer Address Answer')

    if 'feedback' in agent and not has_word('yes|no', customer):
        missed_answers.append('Customer Feedback Answer')

    if not missed_rules and not missed_answers:
        return 1, []  # Compliant
    return 0, missed_rules + missed_answers  # Non-Compliant
59
 
60
+ # Apply the rule validation to each row
61
+ df[['Compliant', 'Missed Rules/Answers']] = df.apply(lambda row: pd.Series(validate_rules(row)), axis=1)
62
 
63
+ # Splitting the data into training and validation datasets
64
+ train_texts, val_texts, train_labels, val_labels = train_test_split(df['Conversation'].tolist(), df['Compliant'].tolist(), test_size=0.2)
65
+
66
+ # Load pre-trained BERT tokenizer
67
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
68
+
69
+ # Tokenize the input data
70
def tokenize_function(examples):
    """Tokenize ``examples`` with the module-level BERT tokenizer.

    Every sequence is padded to a fixed length of 512 tokens and anything
    longer is truncated.
    """
    # NOTE(review): no call to this helper is visible in this file; the
    # train/validation encodings are built by calling ``tokenizer`` directly.
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples, **tokenizer_options)
72
+
73
+ train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
74
+ val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
75
+
76
+ # Create Dataset objects for PyTorch
77
+ train_dataset = Dataset.from_dict({
78
+ 'input_ids': train_encodings['input_ids'],
79
+ 'attention_mask': train_encodings['attention_mask'],
80
+ 'labels': train_labels
81
+ })
82
+
83
+ val_dataset = Dataset.from_dict({
84
+ 'input_ids': val_encodings['input_ids'],
85
+ 'attention_mask': val_encodings['attention_mask'],
86
+ 'labels': val_labels
87
+ })
88
+
89
+ # Load pre-trained BERT model
90
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) # Binary classification (Compliant vs Non-Compliant)
91
+
92
+ # Define compute_metrics function for evaluation
93
def compute_metrics(p):
    """Compute accuracy for a Trainer evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (or a tuple whose first element does) and
            ``labels`` holds the integer class ids.

    Returns:
        ``{'accuracy': <float>}`` - the fraction of rows whose
        highest-scoring class equals the label.
    """
    import numpy as np  # local import so the block is self-contained

    predictions, labels = p
    # Some models return extra outputs alongside the logits; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Trainer supplies NumPy arrays here, and torch.argmax rejects them,
    # so the reduction is done with NumPy.
    predicted_classes = np.argmax(np.asarray(predictions), axis=-1)
    accuracy = float(np.mean(predicted_classes == np.asarray(labels)))
    return {'accuracy': accuracy}
97
+
98
+ # Define training arguments for the Trainer
99
+ training_args = TrainingArguments(
100
+ output_dir='./results',
101
+ evaluation_strategy='epoch',
102
+ learning_rate=2e-5,
103
+ per_device_train_batch_size=8,
104
+ per_device_eval_batch_size=8,
105
+ num_train_epochs=3,
106
+ weight_decay=0.01,
107
+ logging_dir='./logs',
108
  )
109
 
110
+ # Initialize Trainer
111
+ trainer = Trainer(
112
+ model=model,
113
+ args=training_args,
114
+ train_dataset=train_dataset,
115
+ eval_dataset=val_dataset,
116
+ compute_metrics=compute_metrics
117
+ )
118
 
119
+ # Train the model
120
+ trainer.train()
121
 
122
+ # Evaluate the model
123
+ eval_results = trainer.evaluate()
124
+ print(f"Evaluation results: {eval_results}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ # Save the trained model
127
+ model.save_pretrained('./dishTV_bert_model')
128
+ tokenizer.save_pretrained('./dishTV_bert_model')
 
129
 
130
+ # Testing the model with an example
131
def predict(text):
    """Classify a single conversation text with the module-level model.

    Args:
        text: The conversation string to classify.

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Training may have moved the model off the CPU; the inputs must live on
    # the same device or the forward pass fails with a device mismatch.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout so the prediction is deterministic
    with torch.no_grad():  # inference only: skip gradient bookkeeping
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=-1)
    return prediction.item()
136
+
137
+ # Example test
138
+ test_text = "Hello! I need help with my DishTV subscription."
139
+ prediction = predict(test_text)
140
+ predicted_compliance = "Compliant" if prediction == 1 else "Non-Compliant"
141
+ print(f"Predicted Compliance: {predicted_compliance}")