ketanchaudhary88 committed on
Commit 1e4ae57 · verified · 1 Parent(s): 2d7b37e

Update app.py

Files changed (1)
  1. app.py +62 -51
app.py CHANGED
@@ -1,28 +1,37 @@
- from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
- from datasets import Dataset
  import torch
- from sklearn.model_selection import train_test_split
  import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+ from datasets import Dataset
+ import numpy as np

- # Load data (use your own dataset CSV here)
+ # Load your CSV file into a pandas DataFrame
  df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
- df = df.dropna(subset=['Agent Utterance', 'Customer Utterance', 'Category', 'Rule Followed', 'Question Asked', 'Question Answered'])
+
+ # Print the column names and first few rows to check the data structure
+ print(df.columns)
+ print(df.head())
+
+ # Create a conversation column by merging the agent's and customer's utterances
  df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
- df['Category'] = df['Category'].map({
-     'Greeting': 0,
-     'Addressing Issue': 1,
-     'Feedback': 2,
-     'Resolution': 3,
-     'Address': 4
- })

- # Split data
- train_texts, val_texts, train_labels, val_labels = train_test_split(df['Conversation'].tolist(), df['Compliant'].tolist(), test_size=0.2)
+ # Map the Yes/No labels to 1/0 for classification (Rule Followed, Question Asked, Question Answered)
+ df['Rule Followed'] = df['Rule Followed'].map({'Yes': 1, 'No': 0})
+ df['Question Asked'] = df['Question Asked'].map({'Yes': 1, 'No': 0})
+ df['Question Answered'] = df['Question Answered'].map({'Yes': 1, 'No': 0})

- # Tokenizer
+ # Split data into training and validation sets
+ train_texts, val_texts, train_labels, val_labels = train_test_split(
+     df['Conversation'].tolist(),
+     df[['Rule Followed', 'Question Asked', 'Question Answered']].values,
+     test_size=0.2
+ )
+
+ # Initialize BERT tokenizer
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

- # Tokenize the inputs
+ # Tokenize the conversations
  train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
  val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

@@ -30,75 +39,77 @@ val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
  train_dataset = Dataset.from_dict({
      'input_ids': train_encodings['input_ids'],
      'attention_mask': train_encodings['attention_mask'],
-     'labels': train_labels
+     'labels': torch.tensor(train_labels, dtype=torch.float32)
  })

  val_dataset = Dataset.from_dict({
      'input_ids': val_encodings['input_ids'],
      'attention_mask': val_encodings['attention_mask'],
-     'labels': val_labels
+     'labels': torch.tensor(val_labels, dtype=torch.float32)
  })

- # Check dataset sizes
- print(f"Training dataset size: {len(train_dataset)}")
- print(f"Validation dataset size: {len(val_dataset)}")
-
- # Model
- model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
-
- # Calculate steps per epoch
- steps_per_epoch = len(train_dataset) // 8  # Assuming batch size = 8
- num_train_epochs = 3  # Desired number of epochs
- max_steps = steps_per_epoch * num_train_epochs
+ # Initialize the BERT model for multi-label classification (3 labels)
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, problem_type="multi_label_classification")

- # Define training arguments
+ # Define the training arguments
  training_args = TrainingArguments(
-     output_dir='results',
-     eval_strategy='epoch',  # Evaluate at the end of each epoch
+     output_dir='./results',
+     evaluation_strategy='epoch',  # Evaluate after each epoch
      learning_rate=2e-5,
-     per_device_train_batch_size=8,  # You can adjust batch size here
+     per_device_train_batch_size=8,
      per_device_eval_batch_size=8,
-     num_train_epochs=num_train_epochs,  # Setting epochs to 3
+     num_train_epochs=3,
      weight_decay=0.01,
-     logging_dir='logs',
+     logging_dir='./logs',
      logging_steps=500,
-     save_steps=1000,
+     save_strategy='epoch',  # must match the evaluation strategy when load_best_model_at_end=True
      load_best_model_at_end=True,
      metric_for_best_model="accuracy",
-     max_steps=max_steps,  # Limit the total steps
+     do_train=True,
+     do_eval=True
  )

+ # Trainer setup
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
-     compute_metrics=lambda p: {'accuracy': (p.predictions.argmax(axis=-1) == p.label_ids).mean()}
+     compute_metrics=lambda p: {
+         # threshold logits at 0 (sigmoid > 0.5) and require all three labels to match
+         'accuracy': np.mean(np.all((p.predictions > 0).astype(int) == p.label_ids, axis=1))
+     }
  )

  # Start training
- print(f"Starting training for {num_train_epochs} epochs...")
  trainer.train()

  # Evaluate the model
- print("Evaluating model...")
  eval_results = trainer.evaluate()
  print(f"Evaluation results: {eval_results}")

- # Save the trained model
- model.save_pretrained('dishTV_bert_model')
- tokenizer.save_pretrained('dishTV_bert_model')
-
- # Testing the model with an example
- def predict(text):
-     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-     outputs = model(**inputs)
-     prediction = torch.argmax(outputs.logits, dim=-1)
-     return prediction.item()
+ # Define a new conversation for testing
+ new_conversation = ["Hello! How can I assist you today? I just wanted to check the status of my account."]

- # Example test
- test_text = "Hello! I need help with my DishTV subscription."
- prediction = predict(test_text)
- predicted_compliance = "Compliant" if prediction == 1 else "Non-Compliant"
- print(f"Predicted Compliance: {predicted_compliance}")
+ # Tokenize the new conversation
+ test_encodings = tokenizer(new_conversation, truncation=True, padding=True, max_length=512, return_tensors='pt')
+
+ # Make predictions
+ model.eval()
+ with torch.no_grad():
+     outputs = model(**test_encodings)
+     predictions = torch.sigmoid(outputs.logits).cpu().numpy()  # sigmoid for multi-label classification
+
+ # Display predictions
+ print(f"Predictions (Rule Followed, Question Asked, Question Answered): {predictions}")
+
+ # Round the probabilities to a binary decision for each label
+ predictions_rounded = np.round(predictions)
+ print(f"Predictions (rounded): {predictions_rounded}")
 
 
 
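A note on the new metric: the lambda counts a conversation as correct only when all three labels match at once (exact-match accuracy), which can look pessimistic even when individual labels are mostly right. If per-label visibility is wanted, a named compute_metrics can also report F1 for each label. This is a minimal sketch, not part of the commit, assuming scikit-learn is available and the (num_examples, 3) logits/labels produced above; the metric key names are illustrative:

import numpy as np
from sklearn.metrics import f1_score

LABEL_NAMES = ['rule_followed', 'question_asked', 'question_answered']

def compute_metrics(p):
    # Threshold raw logits at 0, which is equivalent to sigmoid(logit) > 0.5
    preds = (p.predictions > 0).astype(int)
    # Exact-match accuracy: every label must be correct for the example to count
    metrics = {'accuracy': np.mean(np.all(preds == p.label_ids, axis=1))}
    # Per-label F1 makes it visible when one label lags behind the others
    for i, name in enumerate(LABEL_NAMES):
        metrics[f'f1_{name}'] = f1_score(p.label_ids[:, i], preds[:, i], zero_division=0)
    return metrics

Passing compute_metrics=compute_metrics to the Trainer in place of the lambda keeps metric_for_best_model="accuracy" working and logs the three F1 scores at every evaluation. Separately, this commit drops the old save step; if the fine-tuned weights are still needed afterwards, the previous version's pattern still applies:

model.save_pretrained('dishTV_bert_model')
tokenizer.save_pretrained('dishTV_bert_model')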