Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -3,98 +3,154 @@ import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd

-
# Load the CSV file into a pandas DataFrame
-

-#
-

-# Preprocess the data
-# df = df.dropna()  # Optional: Drop rows with missing values
-#X = df.drop(columns=['target_column'])  # Features
-#y = df['target_column']  # Target variable

- …

-#
- …

-# Apply tokenization to the entire dataset
-train_data = [tokenize_function(x) for x in train_data]
-eval_data = [tokenize_function(x) for x in eval_data]

-# Dataset class
-class DialogueDataset(torch.utils.data.Dataset):
-    def __init__(self, data):
-        self.data = data
-        self.labels = [item['label'] for item in data]

- …

- …

-# Load multilingual BERT model for sequence classification
-model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
-
-# Training arguments
-training_args = TrainingArguments(
-    output_dir="./results",
-    eval_strategy="epoch",  # Evaluate every epoch
-    per_device_train_batch_size=8,
-    per_device_eval_batch_size=8,
-    num_train_epochs=2,
-    weight_decay=0.01,
-    logging_dir='./logs',
-)

-#
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-)

-#
-trainer.train()

-# Evaluate the model
-eval_results = trainer.evaluate()
-print("Evaluation Results:", eval_results)


- …
logits = outputs.logits
- …

- …

-# Test the model with new data
-test_customer_input = ""
-test_agent_response = "Is this your address ?"
-result = check_compliance(test_customer_input, test_agent_response)
-print(result)
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from tqdm import tqdm

# Load the CSV file into a pandas DataFrame
+df = pd.read_csv('conversation_data.csv')

+# Ensure that the columns are named correctly
+print(df.head())

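Note: the loader above assumes conversation_data.csv ships with the Space and holds one row per agent/customer exchange. The layout sketched here is inferred from the column accesses further down in the diff; the example rows are made up:

# conversation_data.csv (assumed layout, hypothetical rows)
# Agent Utterance,Customer Utterance,Rule Followed,Question Asked,Question Answered
# "Is this your address?","Yes, it is.",1,1,1
# "One moment please.","Can you hear me?",1,0,0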

+class ConversationDataset(Dataset):
+    def __init__(self, agent_utterances, customer_utterances, rule_followed, question_asked, question_answered):
+        self.agent_utterances = agent_utterances
+        self.customer_utterances = customer_utterances
+        self.rule_followed = rule_followed
+        self.question_asked = question_asked
+        self.question_answered = question_answered
+
+    def __len__(self):
+        return len(self.agent_utterances)
+
+    def __getitem__(self, idx):
+        agent_text = self.agent_utterances[idx]
+        customer_text = self.customer_utterances[idx]
+
+        # Combine both agent and customer utterances into one sequence
+        input_text = agent_text + " [SEP] " + customer_text
+
+        # Tokenize the input
+        inputs = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
+
+        # Return one binary label per task (Rule Followed, Question Asked, Question Answered);
+        # floats, because BCEWithLogitsLoss in multi-label mode expects float targets
+        labels = torch.tensor([self.rule_followed[idx], self.question_asked[idx], self.question_answered[idx]], dtype=torch.float)
+        return {**inputs, 'labels': labels}
+
+
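A quick way to sanity-check ConversationDataset before wiring up the loaders, once the tokenizer defined below exists (toy values, not from the real CSV):

sample = ConversationDataset(["Is this your address?"], ["Yes, it is."], [1], [1], [1])[0]
print(sample['input_ids'].shape)  # torch.Size([1, 128])
print(sample['labels'])           # tensor([1., 1., 1.])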
+# Initialize the tokenizer and model for BERT
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+# Three independent yes/no labels per example, so configure multi-label
+# classification (BCEWithLogitsLoss) rather than a single 3-way softmax
+model = BertForSequenceClassification.from_pretrained(
+    'bert-base-uncased',
+    num_labels=3,
+    problem_type="multi_label_classification",
+)
+
+# Check if GPU is available
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+model.to(device)
+
+
# Extract agent utterances, customer utterances, and labels (rule_followed, question_asked, question_answered)
|
49 |
+
agent_utterances = df['Agent Utterance'].tolist()
|
50 |
+
customer_utterances = df['Customer Utterance'].tolist()
|
51 |
+
rule_followed = df['Rule Followed'].tolist()
|
52 |
+
question_asked = df['Question Asked'].tolist()
|
53 |
+
question_answered = df['Question Answered'].tolist()
|
54 |
+
|
55 |
+
# Split the data into training and validation sets
|
56 |
+
X_train, X_val, y_train, y_val = train_test_split(
|
57 |
+
list(zip(agent_utterances, customer_utterances)),
|
58 |
+
list(zip(rule_followed, question_asked, question_answered)),
|
59 |
+
test_size=0.2, random_state=42
|
60 |
+
)
|
61 |
|
+# Convert to individual lists
+train_agent, train_customer = zip(*X_train)
+train_rule, train_question, train_answer = zip(*y_train)
+
+val_agent, val_customer = zip(*X_val)
+val_rule, val_question, val_answer = zip(*y_val)
+
+# Create dataset objects for training and validation
+train_dataset = ConversationDataset(train_agent, train_customer, train_rule, train_question, train_answer)
+val_dataset = ConversationDataset(val_agent, val_customer, val_rule, val_question, val_answer)
+
+# Create DataLoader for training and validation
+train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
+
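Shape note: because __getitem__ tokenizes with return_tensors="pt", each sample's input_ids is [1, 128] and the default collate stacks batches to [batch, 1, 128]; the squeeze(1) calls in the loops below restore [batch, 128]. A quick check:

batch = next(iter(train_loader))
print(batch['input_ids'].shape)  # torch.Size([8, 1, 128])
print(batch['labels'].shape)     # torch.Size([8, 3])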
+# Set optimizer
+optimizer = AdamW(model.parameters(), lr=1e-5)
+
+# Training loop
+for epoch in range(3):  # Training for 3 epochs
+    model.train()
+    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
+
+    for batch in loop:
+        # Move batch to device (GPU if available) and squeeze out the extra
+        # dim the per-sample tokenizer output adds to input_ids
+        batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
+
+        # Forward pass
+        optimizer.zero_grad()
+        outputs = model(**batch)
+
+        # Compute loss for the multi-label classification
+        loss = outputs.loss
+        loss.backward()
+
+        # Step the optimizer
+        optimizer.step()
+
+        loop.set_postfix(loss=loss.item())
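Once training finishes, it would be natural to persist the fine-tuned weights so the Space does not retrain on every restart; a minimal sketch (the ./compliance_model path is arbitrary):

model.save_pretrained('./compliance_model')
tokenizer.save_pretrained('./compliance_model')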


+# Evaluate the model
+model.eval()
+all_preds = []
+all_labels = []

+with torch.no_grad():
+    for batch in val_loader:
+        batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
+        outputs = model(**batch)
+
+        # Get predictions: each of the 3 labels is an independent binary
+        # decision, so threshold the sigmoid probabilities at 0.5 (an argmax
+        # would collapse the three tasks into a single class choice)
+        logits = outputs.logits
+        preds = (torch.sigmoid(logits) > 0.5).long()
+
+        all_preds.append(preds.cpu().numpy())
+        # Cast the float labels back to ints for the classification report
+        all_labels.append(batch['labels'].long().cpu().numpy())

+# Flatten the lists
+all_preds = [item for sublist in all_preds for item in sublist]
+all_labels = [item for sublist in all_labels for item in sublist]

+# Predictions and labels are already in multi-label indicator format,
+# which classification_report accepts directly
+print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))

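Each row of the report corresponds to one compliance label, so a low F1 on, say, Question Answered flags that task specifically rather than the classifier as a whole.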
+# Test the model with new data
+# Example single prediction
+test_agent_utterance = "What is your account number?"
+test_customer_utterance = "12345"
+
+# Combine agent and customer utterance
+input_text = test_agent_utterance + " [SEP] " + test_customer_utterance
+
+# Tokenize the input
+inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
+
+# Move to the correct device (GPU or CPU)
+inputs = {key: value.to(device) for key, value in inputs.items()}
+
+# Predict using the model
+model.eval()
+with torch.no_grad():
+    outputs = model(**inputs)
logits = outputs.logits
+# Threshold each label's sigmoid probability, matching the evaluation above
+preds = (torch.sigmoid(logits) > 0.5).long()
+
+# Display the prediction for each label
+print(f"Rule Followed: {preds[0][0].item()}")
+print(f"Question Asked: {preds[0][1].item()}")
+print(f"Question Answered: {preds[0][2].item()}")
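For reuse, e.g. behind a Gradio handler, the single-prediction steps above could be folded into a helper; a sketch, where predict_compliance is a hypothetical name rather than something defined elsewhere in this file:

def predict_compliance(agent_text, customer_text):
    inputs = tokenizer(agent_text + " [SEP] " + customer_text, return_tensors="pt",
                       padding='max_length', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    preds = (torch.sigmoid(logits)[0] > 0.5).long().tolist()
    return dict(zip(["Rule Followed", "Question Asked", "Question Answered"], preds))

print(predict_compliance("Is this your address?", "Yes, it is."))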