ketanchaudhary88 committed on
Commit
e7ed4e3
·
verified ·
1 Parent(s): 1432ebd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -28
app.py CHANGED
@@ -1,15 +1,21 @@
1
- from sklearn.model_selection import train_test_split
2
  import torch
3
- from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
 
 
 
 
4
  import pandas as pd
 
5
 
6
- # Load the CSV file into a pandas DataFrame
7
- df = pd.read_csv('conversation_data.csv')
8
-
9
- # Ensure that the columns are named correctly
10
- print(df.head())
11
 
 
 
 
12
 
 
13
  class ConversationDataset(Dataset):
14
  def __init__(self, agent_utterances, customer_utterances, rule_followed, question_asked, question_answered):
15
  self.agent_utterances = agent_utterances
@@ -35,17 +41,10 @@ class ConversationDataset(Dataset):
35
  labels = torch.tensor([self.rule_followed[idx], self.question_asked[idx], self.question_answered[idx]], dtype=torch.long)
36
  return {**inputs, 'labels': labels}
37
 
 
 
38
 
39
- # Initialize the tokenizer and model for BERT
40
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
41
- model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # Multi-label classification
42
-
43
- # Check if GPU is available
44
- device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
45
- model.to(device)
46
-
47
-
48
- # Extract agent utterances, customer utterances, and labels (rule_followed, question_asked, question_answered)
49
  agent_utterances = df['Agent Utterance'].tolist()
50
  customer_utterances = df['Customer Utterance'].tolist()
51
  rule_followed = df['Rule Followed'].tolist()
@@ -67,14 +66,17 @@ val_agent, val_customer = zip(*X_val)
67
  val_rule, val_question, val_answer = zip(*y_val)
68
 
69
  # Create dataset objects for training and validation
70
- train_dataset = ConversationDataset(train_agent, train_customer, train_rule, train_question, train_answer)
71
- val_dataset = ConversationDataset(val_agent, val_customer, val_rule, val_question, val_answer)
 
 
 
 
72
 
73
  # Create DataLoader for training and validation
74
  train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
75
  val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
76
 
77
-
78
  # Set optimizer
79
  optimizer = AdamW(model.parameters(), lr=1e-5)
80
 
@@ -100,38 +102,41 @@ for epoch in range(3): # Training for 3 epochs
100
 
101
  loop.set_postfix(loss=loss.item())
102
 
103
-
104
-
105
- # Evaluate the model
106
  model.eval()
107
  all_preds = []
108
  all_labels = []
109
 
110
- with torch.no_grad():
111
  for batch in val_loader:
 
112
  batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
 
 
113
  outputs = model(**batch)
114
 
115
- # Get predictions (we are predicting 3 labels)
116
  logits = outputs.logits
117
  preds = torch.argmax(logits, dim=-1)
118
 
 
119
  all_preds.append(preds.cpu().numpy())
120
  all_labels.append(batch['labels'].cpu().numpy())
121
 
122
- # Flatten the lists
123
  all_preds = [item for sublist in all_preds for item in sublist]
124
  all_labels = [item for sublist in all_labels for item in sublist]
125
 
126
- # Convert predictions and labels into multi-label format (for classification report)
127
  print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))
128
 
129
 
130
 
 
131
  # Test the model with new data
132
  # Example single prediction
133
  test_agent_utterance = "What is your account number?"
134
- test_customer_utterance = "12345"
135
 
136
  # Combine agent and customer utterance
137
  input_text = test_agent_utterance + " [SEP] " + test_customer_utterance
 
 
1
  import torch
2
+ from torch.utils.data import Dataset, DataLoader # Import Dataset here
3
+ from transformers import BertTokenizer, BertForSequenceClassification
4
+ from torch.optim import AdamW
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.metrics import classification_report
7
  import pandas as pd
8
+ from tqdm import tqdm
9
 
10
+ # Initialize the tokenizer and model for BERT
11
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
12
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # Multi-label classification
 
 
13
 
14
+ # Ensure the model is on the right device (GPU or CPU)
15
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
16
+ model.to(device)
17
 
18
+ # Custom Dataset Class for Conversation Data
19
  class ConversationDataset(Dataset):
20
  def __init__(self, agent_utterances, customer_utterances, rule_followed, question_asked, question_answered):
21
  self.agent_utterances = agent_utterances
 
41
  labels = torch.tensor([self.rule_followed[idx], self.question_asked[idx], self.question_answered[idx]], dtype=torch.long)
42
  return {**inputs, 'labels': labels}
43
 
44
+ # Example of reading the data from CSV
45
+ df = pd.read_csv('conversation_data.csv')
46
 
47
+ # Extracting the agent and customer utterances along with labels
 
 
 
 
 
 
 
 
 
48
  agent_utterances = df['Agent Utterance'].tolist()
49
  customer_utterances = df['Customer Utterance'].tolist()
50
  rule_followed = df['Rule Followed'].tolist()
 
66
  val_rule, val_question, val_answer = zip(*y_val)
67
 
68
  # Create dataset objects for training and validation
69
+ train_dataset = ConversationDataset(
70
+ train_agent, train_customer, train_rule, train_question, train_answer
71
+ )
72
+ val_dataset = ConversationDataset(
73
+ val_agent, val_customer, val_rule, val_question, val_answer
74
+ )
75
 
76
  # Create DataLoader for training and validation
77
  train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
78
  val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
79
 
 
80
  # Set optimizer
81
  optimizer = AdamW(model.parameters(), lr=1e-5)
82
 
 
102
 
103
  loop.set_postfix(loss=loss.item())
104
 
105
+ # After training, evaluate the model on the validation set
 
 
106
  model.eval()
107
  all_preds = []
108
  all_labels = []
109
 
110
+ with torch.no_grad(): # Disable gradient calculation for inference
111
  for batch in val_loader:
112
+ # Move the batch to the device (GPU/CPU)
113
  batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
114
+
115
+ # Forward pass: Get logits
116
  outputs = model(**batch)
117
 
118
+ # Get predictions (highest probability)
119
  logits = outputs.logits
120
  preds = torch.argmax(logits, dim=-1)
121
 
122
+ # Append predictions and true labels
123
  all_preds.append(preds.cpu().numpy())
124
  all_labels.append(batch['labels'].cpu().numpy())
125
 
126
+ # Flatten lists of predictions and labels
127
  all_preds = [item for sublist in all_preds for item in sublist]
128
  all_labels = [item for sublist in all_labels for item in sublist]
129
 
130
+ # Print classification report for each task
131
  print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))
132
 
133
 
134
 
135
+
136
  # Test the model with new data
137
  # Example single prediction
138
  test_agent_utterance = "What is your account number?"
139
+ test_customer_utterance = "888888"
140
 
141
  # Combine agent and customer utterance
142
  input_text = test_agent_utterance + " [SEP] " + test_customer_utterance