Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,28 +1,37 @@
|
|
1 |
-
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
|
2 |
-
from datasets import Dataset
|
3 |
import torch
|
4 |
-
from sklearn.model_selection import train_test_split
|
5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
# Load
|
8 |
df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
10 |
df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
|
11 |
-
df['Category'] = df['Category'].map({
|
12 |
-
'Greeting': 0,
|
13 |
-
'Addressing Issue': 1,
|
14 |
-
'Feedback': 2,
|
15 |
-
'Resolution': 3,
|
16 |
-
'Address': 4
|
17 |
-
})
|
18 |
|
19 |
-
#
|
20 |
-
|
|
|
|
|
21 |
|
22 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
24 |
|
25 |
-
# Tokenize the
|
26 |
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
|
27 |
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
|
28 |
|
@@ -30,75 +39,77 @@ val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=5
|
|
30 |
train_dataset = Dataset.from_dict({
|
31 |
'input_ids': train_encodings['input_ids'],
|
32 |
'attention_mask': train_encodings['attention_mask'],
|
33 |
-
'labels': train_labels
|
34 |
})
|
35 |
|
36 |
val_dataset = Dataset.from_dict({
|
37 |
'input_ids': val_encodings['input_ids'],
|
38 |
'attention_mask': val_encodings['attention_mask'],
|
39 |
-
'labels': val_labels
|
40 |
})
|
41 |
|
42 |
-
# Check dataset sizes
|
43 |
-
print(f"Training dataset size: {len(train_dataset)}")
|
44 |
-
print(f"Validation dataset size: {len(val_dataset)}")
|
45 |
|
46 |
-
#
|
47 |
-
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=
|
48 |
|
49 |
-
# Calculate steps per epoch
|
50 |
-
steps_per_epoch = len(train_dataset) // 8 # Assuming batch size = 8
|
51 |
-
num_train_epochs = 3 # Desired number of epochs
|
52 |
-
max_steps = steps_per_epoch * num_train_epochs
|
53 |
|
54 |
-
# Define training arguments
|
55 |
training_args = TrainingArguments(
|
56 |
-
output_dir='results',
|
57 |
-
|
58 |
learning_rate=2e-5,
|
59 |
-
per_device_train_batch_size=8,
|
60 |
per_device_eval_batch_size=8,
|
61 |
-
num_train_epochs=
|
62 |
weight_decay=0.01,
|
63 |
-
logging_dir='logs',
|
64 |
logging_steps=500,
|
65 |
save_steps=1000,
|
66 |
load_best_model_at_end=True,
|
67 |
metric_for_best_model="accuracy",
|
68 |
-
|
|
|
69 |
)
|
70 |
|
|
|
|
|
71 |
trainer = Trainer(
|
72 |
model=model,
|
73 |
args=training_args,
|
74 |
train_dataset=train_dataset,
|
75 |
eval_dataset=val_dataset,
|
76 |
-
compute_metrics=lambda p: {
|
|
|
|
|
77 |
)
|
78 |
|
79 |
# Start training
|
80 |
-
print(f"Starting training for {num_train_epochs} epochs...")
|
81 |
trainer.train()
|
82 |
|
|
|
83 |
# Evaluate the model
|
84 |
-
print("Evaluating model...")
|
85 |
eval_results = trainer.evaluate()
|
86 |
print(f"Evaluation results: {eval_results}")
|
87 |
|
88 |
|
89 |
-
# Save the trained model
|
90 |
-
model.save_pretrained('dishTV_bert_model')
|
91 |
-
tokenizer.save_pretrained('dishTV_bert_model')
|
92 |
|
93 |
-
#
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
-
test_text = "Hello! I need help with my DishTV subscription."
|
102 |
-
prediction = predict(test_text)
|
103 |
-
predicted_compliance = "Compliant" if prediction == 1 else "Non-Compliant"
|
104 |
-
print(f"Predicted Compliance: {predicted_compliance}")
|
|
|
|
|
|
|
1 |
import torch
|
|
|
2 |
import pandas as pd
|
3 |
+
from sklearn.model_selection import train_test_split
|
4 |
+
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
|
5 |
+
from datasets import Dataset
|
6 |
+
import numpy as np
|
7 |
|
8 |
+
# Load your CSV file into a pandas DataFrame
|
9 |
df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
|
10 |
+
|
11 |
+
# Print column names and first few rows to ensure data structure
|
12 |
+
print(df.columns)
|
13 |
+
print(df.head())
|
14 |
+
|
15 |
+
# Create a conversation column by merging the agent's and customer's utterances
|
16 |
# Missing utterances are replaced with empty strings first: concatenating a
# string with NaN yields NaN, and a NaN "text" cannot be tokenized later on.
df['Conversation'] = (
    df['Agent Utterance'].fillna('') + " " + df['Customer Utterance'].fillna('')
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
+
# Map labels for classification (Rule Followed, Question Asked, Question Answered)
|
19 |
+
df['Rule Followed'] = df['Rule Followed'].map({'Yes': 1, 'No': 0})
|
20 |
+
df['Question Asked'] = df['Question Asked'].map({'Yes': 1, 'No': 0})
|
21 |
+
df['Question Answered'] = df['Question Answered'].map({'Yes': 1, 'No': 0})
|
22 |
|
23 |
+
# Split data into training and validation sets
|
24 |
+
train_texts, val_texts, train_labels, val_labels = train_test_split(
|
25 |
+
df['Conversation'].tolist(),
|
26 |
+
df[['Rule Followed', 'Question Asked', 'Question Answered']].values,
|
27 |
+
test_size=0.2
|
28 |
+
)
|
29 |
+
|
30 |
+
|
31 |
+
# Initialize BERT tokenizer
|
32 |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
33 |
|
34 |
+
# Tokenize the conversations
|
35 |
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
|
36 |
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
|
37 |
|
|
|
39 |
train_dataset = Dataset.from_dict({
|
40 |
'input_ids': train_encodings['input_ids'],
|
41 |
'attention_mask': train_encodings['attention_mask'],
|
42 |
+
'labels': torch.tensor(train_labels, dtype=torch.float32)
|
43 |
})
|
44 |
|
45 |
val_dataset = Dataset.from_dict({
|
46 |
'input_ids': val_encodings['input_ids'],
|
47 |
'attention_mask': val_encodings['attention_mask'],
|
48 |
+
'labels': torch.tensor(val_labels, dtype=torch.float32)
|
49 |
})
|
50 |
|
|
|
|
|
|
|
51 |
|
52 |
+
# Initialize the BERT model for multi-label classification (3 labels)
|
53 |
+
# The three targets are independent Yes/No flags, so the head is configured
# explicitly for multi-label classification instead of relying on the loss
# being inferred from the label dtype at the first forward pass.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    problem_type='multi_label_classification',
)
|
54 |
|
|
|
|
|
|
|
|
|
55 |
|
56 |
+
# Define the training arguments
|
57 |
# load_best_model_at_end requires checkpoints to be written on the same
# schedule as evaluation, so both strategies are set to 'epoch' (a step-based
# save schedule combined with per-epoch evaluation is rejected at construction).
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # Evaluate after each epoch
    save_strategy='epoch',  # Must match evaluation_strategy (see note above)
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    do_train=True,
    do_eval=True,
)
|
73 |
|
74 |
+
|
75 |
+
# Trainer setup
|
76 |
def _exact_match_accuracy(p):
    """Return the fraction of samples whose labels are all predicted correctly.

    ``p.predictions`` holds one raw logit per label and ``p.label_ids`` the
    matching 0/1 targets. Each logit is thresholded at 0 (equivalent to a
    sigmoid probability of 0.5); taking an argmax across the labels would
    collapse them into a single class index that cannot be compared with the
    per-label targets.
    """
    predicted = p.predictions > 0
    expected = p.label_ids > 0.5
    return {'accuracy': float(np.mean(np.all(predicted == expected, axis=1)))}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_exact_match_accuracy,
)
|
85 |
|
86 |
# Start training
|
|
|
87 |
trainer.train()
|
88 |
|
89 |
+
|
90 |
# Evaluate the model
|
|
|
91 |
eval_results = trainer.evaluate()
|
92 |
print(f"Evaluation results: {eval_results}")
|
93 |
|
94 |
|
|
|
|
|
|
|
95 |
|
96 |
+
# Define a new conversation for testing
|
97 |
+
new_conversation = ["Hello! How can I assist you today? I just wanted to check the status of my account."]
|
98 |
+
|
99 |
+
# Tokenize the new conversation
|
100 |
+
test_encodings = tokenizer(new_conversation, truncation=True, padding=True, max_length=512, return_tensors='pt')
|
101 |
+
|
102 |
+
# Make predictions
|
103 |
+
model.eval()
# Training may have moved the model to an accelerator; the inputs have to live
# on the same device as the model's weights before the forward pass.
test_encodings = {
    name: tensor.to(model.device) for name, tensor in test_encodings.items()
}
with torch.no_grad():
    outputs = model(**test_encodings)
# Sigmoid (not softmax): each label is an independent binary decision.
predictions = torch.sigmoid(outputs.logits).cpu().numpy()
|
107 |
+
|
108 |
+
# Display predictions
|
109 |
+
print(f"Predictions (Rule Followed, Question Asked, Question Answered): {predictions}")
|
110 |
+
|
111 |
+
|
112 |
+
# Round predictions (since we are doing binary classification for each label)
|
113 |
+
predictions_rounded = np.round(predictions)
|
114 |
|
115 |
+
print(f"Predictions (rounded): {predictions_rounded}")
|
|
|
|
|
|
|
|