import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# Load the CSV file into a pandas DataFrame
dataset = pd.read_csv('customer_address_compliance_scenarios.csv')
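# Expected CSV schema (inferred from the tokenization step below):
# 'customer_input' and 'agent_response' text columns, plus a 'label' column
# whose values are 'compliant' or 'non-compliant'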
# Load multilingual BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Optional preprocessing: drop rows with missing values
# dataset = dataset.dropna()

# Split dataset into training and evaluation sets
train_data, eval_data = train_test_split(dataset, test_size=0.2, random_state=42)
# Tokenizer function that also keeps the label in the dataset
def tokenize_function(example):
    tokenized_example = tokenizer(
        example['customer_input'],
        example['agent_response'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    # Convert 'compliant' to 1 and 'non-compliant' to 0
    tokenized_example['label'] = 1 if example['label'] == 'compliant' else 0
    return tokenized_example

# Apply tokenization row by row; iterating a DataFrame directly yields
# column names, so convert each split to a list of row dicts first
train_data = [tokenize_function(row) for row in train_data.to_dict('records')]
eval_data = [tokenize_function(row) for row in eval_data.to_dict('records')]
# Dataset class wrapping the tokenized examples as PyTorch tensors
class DialogueDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        self.labels = [item['label'] for item in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_ids = torch.tensor(item['input_ids'])
        attention_mask = torch.tensor(item['attention_mask'])
        label = torch.tensor(item['label'])
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": label}
# Create PyTorch datasets
train_dataset = DialogueDataset(train_data)
eval_dataset = DialogueDataset(eval_data)
# Load multilingual BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
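# num_labels=2 matches the binary labels produced above: compliant (1) vs non-compliant (0)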
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
)
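
# Optional helper (a sketch, not in the original script): a compute_metrics
# function in the standard Trainer signature so that evaluation reports
# accuracy in addition to loss
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}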
# Trainer wires the model, arguments, and datasets together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
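    compute_metrics=compute_metrics,  # optional accuracy helper sketched above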
)
# Fine-tune the model
trainer.train()
# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
def check_compliance(customer_input, agent_response):
    inputs = tokenizer(customer_input, agent_response, return_tensors="pt",
                       padding=True, truncation=True, max_length=512)
    # Move inputs to the same device as the model (the Trainer may have moved it to GPU)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=-1).item()
    return "Compliant" if predicted_class == 1 else "Non-Compliant"
# Test the model with new data
test_customer_input = ""
test_agent_response = "Is this your address?"
result = check_compliance(test_customer_input, test_agent_response)
print(result)
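
# Optional follow-up (a sketch; the output path is illustrative): persist the
# fine-tuned model and tokenizer so they can be reloaded without retraining
trainer.save_model("./compliance_model")
tokenizer.save_pretrained("./compliance_model")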