ketanchaudhary88 committed on
Commit
a9dfd01
·
verified ·
1 Parent(s): d42bf93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -73
app.py CHANGED
@@ -1,79 +1,32 @@
1
- import pandas as pd
2
- from sklearn.model_selection import train_test_split
3
  from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
4
  from datasets import Dataset
5
  import torch
6
- from sklearn.metrics import accuracy_score
 
7
 
8
- # Load the CSV data
9
  df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
10
-
11
- # Clean the dataset by dropping rows with NaN values in important columns
12
  df = df.dropna(subset=['Agent Utterance', 'Customer Utterance', 'Category', 'Rule Followed', 'Question Asked', 'Question Answered'])
13
-
14
- # Merge Agent and Customer Utterances into a single conversation text
15
  df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
16
-
17
- # Define mappings for categories and labels
18
- category_mapping = {
19
  'Greeting': 0,
20
  'Addressing Issue': 1,
21
  'Feedback': 2,
22
  'Resolution': 3,
23
  'Address': 4
24
- }
25
-
26
- # Map categories to numeric labels
27
- df['Category'] = df['Category'].map(category_mapping)
28
-
29
- # Rule validation functions to check whether each rule was followed by the agent and whether the customer answered
30
- def validate_rules(row):
31
- missed_rules = []
32
- missed_answers = []
33
-
34
- # Rule checks for the agent
35
- if 'hello' not in row['Agent Utterance'].lower() and 'hi' not in row['Agent Utterance'].lower():
36
- missed_rules.append('Greeting')
37
-
38
- if 'address' not in row['Agent Utterance'].lower():
39
- missed_rules.append('Address')
40
-
41
- if 'feedback' not in row['Agent Utterance'].lower():
42
- missed_rules.append('Feedback')
43
-
44
- if 'resolved' not in row['Agent Utterance'].lower() and 'fix' not in row['Agent Utterance'].lower():
45
- missed_rules.append('Resolution')
46
-
47
- # Check if customer answered relevant questions
48
- if 'address' in row['Agent Utterance'].lower() and ('address' not in row['Customer Utterance'].lower()):
49
- missed_answers.append('Customer Address Answer')
50
-
51
- if 'feedback' in row['Agent Utterance'].lower() and ('yes' not in row['Customer Utterance'].lower() and 'no' not in row['Customer Utterance'].lower()):
52
- missed_answers.append('Customer Feedback Answer')
53
-
54
- # Returning the result as compliant or non-compliant
55
- if len(missed_rules) == 0 and len(missed_answers) == 0:
56
- return 1, [] # Compliant
57
- else:
58
- return 0, missed_rules + missed_answers # Non-Compliant
59
-
60
- # Apply the rule validation to each row
61
- df[['Compliant', 'Missed Rules/Answers']] = df.apply(lambda row: pd.Series(validate_rules(row)), axis=1)
62
-
63
- # Splitting the data into training and validation datasets
64
  train_texts, val_texts, train_labels, val_labels = train_test_split(df['Conversation'].tolist(), df['Compliant'].tolist(), test_size=0.2)
65
 
66
- # Load pre-trained BERT tokenizer
67
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
68
 
69
- # Tokenize the input data
70
- def tokenize_function(examples):
71
- return tokenizer(examples, padding="max_length", truncation=True, max_length=512)
72
-
73
  train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
74
  val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
75
 
76
- # Create Dataset objects for PyTorch
77
  train_dataset = Dataset.from_dict({
78
  'input_ids': train_encodings['input_ids'],
79
  'attention_mask': train_encodings['attention_mask'],
@@ -86,46 +39,56 @@ val_dataset = Dataset.from_dict({
86
  'labels': val_labels
87
  })
88
 
89
- # Load pre-trained BERT model
90
- model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) # Binary classification (Compliant vs Non-Compliant)
 
 
 
 
91
 
92
- # Define compute_metrics function for evaluation
93
- def compute_metrics(p):
94
- predictions, labels = p
95
- predictions = torch.argmax(predictions, axis=-1)
96
- return {'accuracy': accuracy_score(labels, predictions)}
97
 
98
- # Define training arguments for the Trainer
99
  training_args = TrainingArguments(
100
  output_dir='./results',
101
- evaluation_strategy='epoch',
102
  learning_rate=2e-5,
103
- per_device_train_batch_size=8,
104
  per_device_eval_batch_size=8,
105
- num_train_epochs=3,
106
  weight_decay=0.01,
107
  logging_dir='./logs',
 
 
 
 
 
108
  )
109
 
110
- # Initialize Trainer
111
  trainer = Trainer(
112
  model=model,
113
  args=training_args,
114
  train_dataset=train_dataset,
115
  eval_dataset=val_dataset,
116
- compute_metrics=compute_metrics
117
  )
118
 
119
- # Train the model
 
120
  trainer.train()
121
 
122
  # Evaluate the model
 
123
  eval_results = trainer.evaluate()
124
  print(f"Evaluation results: {eval_results}")
125
 
 
126
  # Save the trained model
127
- model.save_pretrained('./dishTV_bert_model')
128
- tokenizer.save_pretrained('./dishTV_bert_model')
129
 
130
  # Testing the model with an example
131
  def predict(text):
 
 
 
1
  from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
2
  from datasets import Dataset
3
  import torch
4
+ from sklearn.model_selection import train_test_split
5
+ import pandas as pd
6
 
7
+ # Load data (use your own dataset CSV here)
8
  df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
 
 
9
  df = df.dropna(subset=['Agent Utterance', 'Customer Utterance', 'Category', 'Rule Followed', 'Question Asked', 'Question Answered'])
 
 
10
  df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
11
+ df['Category'] = df['Category'].map({
 
 
12
  'Greeting': 0,
13
  'Addressing Issue': 1,
14
  'Feedback': 2,
15
  'Resolution': 3,
16
  'Address': 4
17
+ })
18
+
19
+ # Split data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  train_texts, val_texts, train_labels, val_labels = train_test_split(df['Conversation'].tolist(), df['Compliant'].tolist(), test_size=0.2)
21
 
22
+ # Tokenizer
23
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
24
 
25
+ # Tokenize the inputs
 
 
 
26
  train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
27
  val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
28
 
29
+ # Create PyTorch datasets
30
  train_dataset = Dataset.from_dict({
31
  'input_ids': train_encodings['input_ids'],
32
  'attention_mask': train_encodings['attention_mask'],
 
39
  'labels': val_labels
40
  })
41
 
42
+ # Check dataset sizes
43
+ print(f"Training dataset size: {len(train_dataset)}")
44
+ print(f"Validation dataset size: {len(val_dataset)}")
45
+
46
+ # Model
47
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
48
 
49
+ # Calculate steps per epoch
50
+ steps_per_epoch = len(train_dataset) // 8 # Assuming batch size = 8
51
+ num_train_epochs = 3 # Desired number of epochs
52
+ max_steps = steps_per_epoch * num_train_epochs
 
53
 
54
+ # Define training arguments
55
  training_args = TrainingArguments(
56
  output_dir='./results',
57
+ evaluation_strategy='epoch', # Evaluate at the end of each epoch
58
  learning_rate=2e-5,
59
+ per_device_train_batch_size=8, # You can adjust batch size here
60
  per_device_eval_batch_size=8,
61
+ num_train_epochs=num_train_epochs, # Setting epochs to 3
62
  weight_decay=0.01,
63
  logging_dir='./logs',
64
+ logging_steps=500,
65
+ save_steps=1000,
66
+ load_best_model_at_end=True,
67
+ metric_for_best_model="accuracy",
68
+ max_steps=max_steps, # Limit the total steps
69
  )
70
 
 
71
  trainer = Trainer(
72
  model=model,
73
  args=training_args,
74
  train_dataset=train_dataset,
75
  eval_dataset=val_dataset,
76
+ compute_metrics=lambda p: {'accuracy': (p.predictions.argmax(axis=-1) == p.label_ids).mean()}
77
  )
78
 
79
+ # Start training
80
+ print(f"Starting training for {num_train_epochs} epochs...")
81
  trainer.train()
82
 
83
  # Evaluate the model
84
+ print("Evaluating model...")
85
  eval_results = trainer.evaluate()
86
  print(f"Evaluation results: {eval_results}")
87
 
88
+
89
  # Save the trained model
90
+ model.save_pretrained('dishTV_bert_model')
91
+ tokenizer.save_pretrained('dishTV_bert_model')
92
 
93
  # Testing the model with an example
94
  def predict(text):