import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


# Load the CSV file into a pandas DataFrame
dataset = pd.read_csv('customer_address_compliance_scenarios.csv')
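# The CSV is expected to provide 'customer_input', 'agent_response', and 'label'
# columns ('compliant' / 'non-compliant'), matching how rows are used below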

# Load multilingual BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Preprocess the data
dataset = dataset.dropna()  # drop rows with missing values so the tokenizer never sees NaN

# Split dataset into training and evaluation sets (fixed seed for reproducibility)
train_data, eval_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Tokenize a single example as a (customer_input, agent_response) sentence pair,
# keeping the label alongside the token ids
def tokenize_function(example):
    tokenized_example = tokenizer(
        example['customer_input'],
        example['agent_response'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    # Convert 'compliant' to 1 and anything else ('non-compliant') to 0
    tokenized_example['label'] = 1 if example['label'] == 'compliant' else 0
    return tokenized_example

# Apply tokenization row by row; iterating a DataFrame directly yields column
# names, so use iterrows() to get the actual records
train_data = [tokenize_function(row) for _, row in train_data.iterrows()]
eval_data = [tokenize_function(row) for _, row in eval_data.iterrows()]

# Wrap the tokenized records as a PyTorch dataset for the Trainer
class DialogueDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            "input_ids": torch.tensor(item['input_ids']),
            "token_type_ids": torch.tensor(item['token_type_ids']),  # segment ids for the sentence pair
            "attention_mask": torch.tensor(item['attention_mask']),
            "labels": torch.tensor(item['label']),
        }


# Create PyTorch datasets
train_dataset = DialogueDataset(train_data)
eval_dataset = DialogueDataset(eval_data)


# Load multilingual BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
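# The classification head on top of BERT is newly initialized, so transformers
# will warn about randomly initialized weights here; that is expected before fine-tuning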

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate at the end of each epoch (older transformers versions call this "evaluation_strategy")
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
)
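
# Optional, a minimal sketch: the Trainer only reports eval loss by default.
# To also report accuracy, define a metric function like the one below (the
# names are illustrative) and pass compute_metrics=compute_metrics to the Trainer.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # the Trainer hands these over as numpy arrays
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}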

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


def check_compliance(customer_input, agent_response):
    """Classify a single customer/agent exchange with the fine-tuned model."""
    model.eval()  # make sure dropout is disabled for inference
    inputs = tokenizer(customer_input, agent_response, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # match the model's device (CPU or GPU)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return "Compliant" if predicted_class == 1 else "Non-Compliant"

# Test the model with new data
test_customer_input = ""
test_agent_response = "Is this your address?"
result = check_compliance(test_customer_input, test_agent_response)
print(result)
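
# A second, purely illustrative exchange (the strings are made up, not from the dataset)
print(check_compliance("My address is 12 Main Street.",
                       "Thank you, I have updated your address to 12 Main Street."))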