ketanchaudhary88 committed on
Commit 1432ebd · verified
1 Parent(s): bd4d894

Update app.py

Files changed (1)
  1. app.py +134 -78
app.py CHANGED
@@ -3,98 +3,154 @@ import torch
  from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
  import pandas as pd

- # Load the CSV file into a pandas DataFrame
- dataset = pd.read_csv('customer_address_compliance_scenarios.csv')

- # Load the multilingual BERT tokenizer
- tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

- # Preprocess the data
- # df = df.dropna()  # Optional: drop rows with missing values
- # X = df.drop(columns=['target_column'])  # Features
- # y = df['target_column']  # Target variable

- # Split the dataset into training and evaluation sets
- train_data, eval_data = train_test_split(dataset, test_size=0.2)
- # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- # Tokenizer function that also keeps the label in the dataset
- def tokenize_function(example):
-     tokenized_example = tokenizer(example['customer_input'], example['agent_response'], padding='max_length', truncation=True, max_length=512)
-     tokenized_example['label'] = 1 if example['label'] == 'compliant' else 0  # Map 'compliant' to 1, 'non-compliant' to 0
-     return tokenized_example

- # Apply tokenization to the entire dataset
- train_data = [tokenize_function(x) for x in train_data]
- eval_data = [tokenize_function(x) for x in eval_data]

- # Dataset class
- class DialogueDataset(torch.utils.data.Dataset):
-     def __init__(self, data):
-         self.data = data
-         self.labels = [item['label'] for item in data]

-     def __len__(self):
-         return len(self.data)

-     def __getitem__(self, idx):
-         item = self.data[idx]
-         input_ids = torch.tensor(item['input_ids'])
-         attention_mask = torch.tensor(item['attention_mask'])
-         label = torch.tensor(item['label'])
-         return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": label}

- # Create PyTorch datasets
- train_dataset = DialogueDataset(train_data)
- eval_dataset = DialogueDataset(eval_data)

- # Load the multilingual BERT model for sequence classification
- model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

- # Training arguments
- training_args = TrainingArguments(
-     output_dir="./results",
-     eval_strategy="epoch",  # Evaluate every epoch
-     per_device_train_batch_size=8,
-     per_device_eval_batch_size=8,
-     num_train_epochs=2,
-     weight_decay=0.01,
-     logging_dir='./logs',
- )

- # Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=train_dataset,
-     eval_dataset=eval_dataset,
- )

- # Fine-tune the model
- trainer.train()

- # Evaluate the model
- eval_results = trainer.evaluate()
- print("Evaluation Results:", eval_results)

- def check_compliance(customer_input, agent_response):
-     inputs = tokenizer(customer_input, agent_response, return_tensors="pt", padding=True, truncation=True, max_length=512)
-     with torch.no_grad():
-         outputs = model(**inputs)
      logits = outputs.logits
-     predicted_class = torch.argmax(logits, dim=-1).item()

-     if predicted_class == 1:
-         return "Compliant"
-     else:
-         return "Non-Compliant"

- # Test the model with new data
- test_customer_input = ""
- test_agent_response = "Is this your address ?"
- result = check_compliance(test_customer_input, test_agent_response)
- print(result)
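Note on the removed version: iterating a pandas DataFrame directly (`for x in train_data`) yields column names, not rows, so `tokenize_function(x)` would have failed on the string `x`. A minimal row-wise sketch (assuming the same `customer_input` / `agent_response` columns) would have been:

    # Hypothetical fix for the removed version: iterate over DataFrame rows,
    # not column names, so each `row` supports row['customer_input'] etc.
    train_data = [tokenize_function(row) for _, row in train_data.iterrows()]
    eval_data = [tokenize_function(row) for _, row in eval_data.iterrows()]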
  from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
  import pandas as pd
+ # Imports required by the code below (missing in the committed file)
+ from torch.utils.data import Dataset, DataLoader
+ from torch.optim import AdamW
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import classification_report
+ from tqdm import tqdm

+ # Load the CSV file into a pandas DataFrame
+ df = pd.read_csv('conversation_data.csv')

+ # Inspect the first rows to check that the columns are named correctly
+ print(df.head())

+ class ConversationDataset(Dataset):
+     def __init__(self, agent_utterances, customer_utterances, rule_followed, question_asked, question_answered):
+         self.agent_utterances = agent_utterances
+         self.customer_utterances = customer_utterances
+         self.rule_followed = rule_followed
+         self.question_asked = question_asked
+         self.question_answered = question_answered

+     def __len__(self):
+         return len(self.agent_utterances)

+     def __getitem__(self, idx):
+         agent_text = self.agent_utterances[idx]
+         customer_text = self.customer_utterances[idx]

+         # Combine the agent and customer utterances into one sequence
+         input_text = agent_text + " [SEP] " + customer_text

+         # Tokenize the input
+         inputs = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

+         # Return inputs plus one binary label per task (Rule Followed, Question Asked,
+         # Question Answered); float dtype is required by the BCE loss used below
+         labels = torch.tensor([self.rule_followed[idx], self.question_asked[idx], self.question_answered[idx]], dtype=torch.float)
+         return {**inputs, 'labels': labels}


+ # Initialize the tokenizer and model for BERT; problem_type selects BCEWithLogitsLoss
+ # so the three labels are treated as independent binary tasks (multi-label classification)
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, problem_type="multi_label_classification")

+ # Check if a GPU is available
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+ model.to(device)


+ # Extract agent utterances, customer utterances, and labels (rule_followed, question_asked, question_answered)
+ agent_utterances = df['Agent Utterance'].tolist()
+ customer_utterances = df['Customer Utterance'].tolist()
+ rule_followed = df['Rule Followed'].tolist()
+ question_asked = df['Question Asked'].tolist()
+ question_answered = df['Question Answered'].tolist()

+ # Split the data into training and validation sets
+ X_train, X_val, y_train, y_val = train_test_split(
+     list(zip(agent_utterances, customer_utterances)),
+     list(zip(rule_followed, question_asked, question_answered)),
+     test_size=0.2, random_state=42
+ )

+ # Convert back to individual lists
+ train_agent, train_customer = zip(*X_train)
+ train_rule, train_question, train_answer = zip(*y_train)

+ val_agent, val_customer = zip(*X_val)
+ val_rule, val_question, val_answer = zip(*y_val)

+ # Create dataset objects for training and validation
+ train_dataset = ConversationDataset(train_agent, train_customer, train_rule, train_question, train_answer)
+ val_dataset = ConversationDataset(val_agent, val_customer, val_rule, val_question, val_answer)

+ # Create DataLoaders for training and validation
+ train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
+ val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


+ # Set up the optimizer
+ optimizer = AdamW(model.parameters(), lr=1e-5)

+ # Training loop
+ for epoch in range(3):  # Train for 3 epochs
+     model.train()
+     loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

+     for batch in loop:
+         # Move the batch to the device (GPU if available); squeeze(1) drops the extra
+         # dimension that return_tensors="pt" adds to each tokenized item
+         batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}

+         # Forward pass
+         optimizer.zero_grad()
+         outputs = model(**batch)

+         # Compute the multi-label classification loss
+         loss = outputs.loss
+         loss.backward()

+         # Step the optimizer
+         optimizer.step()

+         loop.set_postfix(loss=loss.item())


+ # Evaluate the model
+ model.eval()
+ all_preds = []
+ all_labels = []

+ with torch.no_grad():
+     for batch in val_loader:
+         batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
+         outputs = model(**batch)

+         # Get predictions for the 3 labels: sigmoid + 0.5 threshold per label
+         # (argmax would pick a single class, which is wrong for multi-label)
+         logits = outputs.logits
+         preds = (torch.sigmoid(logits) > 0.5).long()

+         all_preds.append(preds.cpu().numpy())
+         all_labels.append(batch['labels'].long().cpu().numpy())

+ # Flatten the per-batch arrays into (num_examples, 3) indicator rows
+ all_preds = [item for sublist in all_preds for item in sublist]
+ all_labels = [item for sublist in all_labels for item in sublist]

+ # Report per-label precision/recall/F1 (classification_report accepts
+ # multi-label indicator arrays)
+ print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))


+ # Test the model with new data
+ # Example single prediction
+ test_agent_utterance = "What is your account number?"
+ test_customer_utterance = "12345"

+ # Combine the agent and customer utterances
+ input_text = test_agent_utterance + " [SEP] " + test_customer_utterance

+ # Tokenize the input
+ inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)

+ # Move to the correct device (GPU or CPU)
+ inputs = {key: value.to(device) for key, value in inputs.items()}

+ # Predict using the model
+ model.eval()
+ with torch.no_grad():
+     outputs = model(**inputs)
      logits = outputs.logits
+     preds = (torch.sigmoid(logits) > 0.5).long()

+ # Display the prediction for each label
+ print(f"Rule Followed: {preds[0][0].item()}")
+ print(f"Question Asked: {preds[0][1].item()}")
+ print(f"Question Answered: {preds[0][2].item()}")
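The updated script reads `conversation_data.csv`, which is not part of this commit. A hypothetical sketch of the layout it expects, using only the column names referenced above (one row per agent/customer exchange, binary 0/1 labels per task):

    import pandas as pd

    # Hypothetical example rows matching the columns the script reads;
    # the real conversation_data.csv is not included in this commit.
    pd.DataFrame({
        'Agent Utterance': ["What is your account number?", "Can you confirm your address?"],
        'Customer Utterance': ["12345", "I'd rather not say."],
        'Rule Followed': [1, 1],
        'Question Asked': [1, 1],
        'Question Answered': [1, 0],
    }).to_csv('conversation_data.csv', index=False)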