Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -3,98 +3,154 @@ import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd

-
# Load the CSV file into a pandas DataFrame
-

-#
-

-# Preprocess the data
-# df = df.dropna()  # Optional: Drop rows with missing values
-#X = df.drop(columns=['target_column'])  # Features
-#y = df['target_column']  # Target variable

- …

-#
- …

-# Apply tokenization to the entire dataset
-train_data = [tokenize_function(x) for x in train_data]
-eval_data = [tokenize_function(x) for x in eval_data]

-# Dataset class
-class DialogueDataset(torch.utils.data.Dataset):
-    def __init__(self, data):
-        self.data = data
-        self.labels = [item['label'] for item in data]

- …

- …

-# Load multilingual BERT model for sequence classification
-model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
-
-# Training arguments
-training_args = TrainingArguments(
-    output_dir="./results",
-    eval_strategy="epoch",  # Evaluate every epoch
-    per_device_train_batch_size=8,
-    per_device_eval_batch_size=8,
-    num_train_epochs=2,
-    weight_decay=0.01,
-    logging_dir='./logs',
-)

-#
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-)

-#
-trainer.train()

-# Evaluate the model
-eval_results = trainer.evaluate()
-print("Evaluation Results:", eval_results)


- …
logits = outputs.logits
- …

- …

-# Test the model with new data
-test_customer_input = ""
-test_agent_response = "Is this your address ?"
-result = check_compliance(test_customer_input, test_agent_response)
-print(result)
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+from tqdm import tqdm

# Load the CSV file into a pandas DataFrame
+df = pd.read_csv('conversation_data.csv')

+# Ensure that the columns are named correctly
+print(df.head())

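Note: the loader above assumes conversation_data.csv ships with the Space and holds one row per agent/customer exchange. The layout sketched here is inferred from the column accesses further down in the diff; the example rows are made up:

# conversation_data.csv (assumed layout, hypothetical rows)
# Agent Utterance,Customer Utterance,Rule Followed,Question Asked,Question Answered
# "Is this your address?","Yes, it is.",1,1,1
# "One moment please.","Can you hear me?",1,0,0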

+class ConversationDataset(Dataset):
+    def __init__(self, agent_utterances, customer_utterances, rule_followed, question_asked, question_answered):
+        self.agent_utterances = agent_utterances
+        self.customer_utterances = customer_utterances
+        self.rule_followed = rule_followed
+        self.question_asked = question_asked
+        self.question_answered = question_answered
+
+    def __len__(self):
+        return len(self.agent_utterances)
+
+    def __getitem__(self, idx):
+        agent_text = self.agent_utterances[idx]
+        customer_text = self.customer_utterances[idx]
+
+        # Combine both agent and customer utterances into one sequence
+        input_text = agent_text + " [SEP] " + customer_text
+
+        # Tokenize the input
+        inputs = tokenizer(input_text, padding='max_length', truncation=True, max_length=128, return_tensors="pt")
+
+        # Return one binary label per task (Rule Followed, Question Asked, Question Answered);
+        # floats, because BCEWithLogitsLoss in multi-label mode expects float targets
+        labels = torch.tensor([self.rule_followed[idx], self.question_asked[idx], self.question_answered[idx]], dtype=torch.float)
+        return {**inputs, 'labels': labels}
+
+
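A quick way to sanity-check ConversationDataset before wiring up the loaders, once the tokenizer defined below exists (toy values, not from the real CSV):

sample = ConversationDataset(["Is this your address?"], ["Yes, it is."], [1], [1], [1])[0]
print(sample['input_ids'].shape)  # torch.Size([1, 128])
print(sample['labels'])           # tensor([1., 1., 1.])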
+# Initialize the tokenizer and model for BERT
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+# Three independent yes/no labels per example, so configure multi-label
+# classification (BCEWithLogitsLoss) rather than a single 3-way softmax
+model = BertForSequenceClassification.from_pretrained(
+    'bert-base-uncased',
+    num_labels=3,
+    problem_type="multi_label_classification",
+)
+
+# Check if GPU is available
+device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+model.to(device)
+
+
# Extract agent utterances, customer utterances, and labels (rule_followed, question_asked, question_answered)
|
49 |
+
agent_utterances = df['Agent Utterance'].tolist()
|
50 |
+
customer_utterances = df['Customer Utterance'].tolist()
|
51 |
+
rule_followed = df['Rule Followed'].tolist()
|
52 |
+
question_asked = df['Question Asked'].tolist()
|
53 |
+
question_answered = df['Question Answered'].tolist()
|
54 |
+
|
55 |
+
# Split the data into training and validation sets
|
56 |
+
X_train, X_val, y_train, y_val = train_test_split(
|
57 |
+
list(zip(agent_utterances, customer_utterances)),
|
58 |
+
list(zip(rule_followed, question_asked, question_answered)),
|
59 |
+
test_size=0.2, random_state=42
|
60 |
+
)
|
61 |
|
+# Convert to individual lists
+train_agent, train_customer = zip(*X_train)
+train_rule, train_question, train_answer = zip(*y_train)
+
+val_agent, val_customer = zip(*X_val)
+val_rule, val_question, val_answer = zip(*y_val)
+
+# Create dataset objects for training and validation
+train_dataset = ConversationDataset(train_agent, train_customer, train_rule, train_question, train_answer)
+val_dataset = ConversationDataset(val_agent, val_customer, val_rule, val_question, val_answer)
+
+# Create DataLoader for training and validation
+train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
+val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
+
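Shape note: because __getitem__ tokenizes with return_tensors="pt", each sample's input_ids is [1, 128] and the default collate stacks batches to [batch, 1, 128]; the squeeze(1) calls in the loops below restore [batch, 128]. A quick check:

batch = next(iter(train_loader))
print(batch['input_ids'].shape)  # torch.Size([8, 1, 128])
print(batch['labels'].shape)     # torch.Size([8, 3])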
+# Set optimizer
+optimizer = AdamW(model.parameters(), lr=1e-5)
+
+# Training loop
+for epoch in range(3):  # Training for 3 epochs
+    model.train()
+    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
+
+    for batch in loop:
+        # Move batch to device (GPU if available) and squeeze out the extra
+        # dim the per-sample tokenizer output adds to input_ids
+        batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
+
+        # Forward pass
+        optimizer.zero_grad()
+        outputs = model(**batch)
+
+        # Compute loss for the multi-label classification
+        loss = outputs.loss
+        loss.backward()
+
+        # Step the optimizer
+        optimizer.step()
+
+        loop.set_postfix(loss=loss.item())
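Once training finishes, it would be natural to persist the fine-tuned weights so the Space does not retrain on every restart; a minimal sketch (the ./compliance_model path is arbitrary):

model.save_pretrained('./compliance_model')
tokenizer.save_pretrained('./compliance_model')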


+# Evaluate the model
+model.eval()
+all_preds = []
+all_labels = []

+with torch.no_grad():
+    for batch in val_loader:
+        batch = {key: value.squeeze(1).to(device) for key, value in batch.items()}
+        outputs = model(**batch)
+
+        # Get predictions: each of the 3 labels is an independent binary
+        # decision, so threshold the sigmoid probabilities at 0.5 (an argmax
+        # would collapse the three tasks into a single class choice)
+        logits = outputs.logits
+        preds = (torch.sigmoid(logits) > 0.5).long()
+
+        all_preds.append(preds.cpu().numpy())
+        # Cast the float labels back to ints for the classification report
+        all_labels.append(batch['labels'].long().cpu().numpy())

+# Flatten the lists
+all_preds = [item for sublist in all_preds for item in sublist]
+all_labels = [item for sublist in all_labels for item in sublist]

+# Predictions and labels are already in multi-label indicator format,
+# which classification_report accepts directly
+print(classification_report(all_labels, all_preds, target_names=["Rule Followed", "Question Asked", "Question Answered"]))

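Each row of the report corresponds to one compliance label, so a low F1 on, say, Question Answered flags that task specifically rather than the classifier as a whole.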
+# Test the model with new data
+# Example single prediction
+test_agent_utterance = "What is your account number?"
+test_customer_utterance = "12345"
+
+# Combine agent and customer utterance
+input_text = test_agent_utterance + " [SEP] " + test_customer_utterance
+
+# Tokenize the input
+inputs = tokenizer(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
+
+# Move to the correct device (GPU or CPU)
+inputs = {key: value.to(device) for key, value in inputs.items()}
+
+# Predict using the model
+model.eval()
+with torch.no_grad():
+    outputs = model(**inputs)
logits = outputs.logits
+# Threshold each label's sigmoid probability, matching the evaluation above
+preds = (torch.sigmoid(logits) > 0.5).long()
+
+# Display the prediction for each label
+print(f"Rule Followed: {preds[0][0].item()}")
+print(f"Question Asked: {preds[0][1].item()}")
+print(f"Question Answered: {preds[0][2].item()}")
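For reuse, e.g. behind a Gradio handler, the single-prediction steps above could be folded into a helper; a sketch, where predict_compliance is a hypothetical name rather than something defined elsewhere in this file:

def predict_compliance(agent_text, customer_text):
    inputs = tokenizer(agent_text + " [SEP] " + customer_text, return_tensors="pt",
                       padding='max_length', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    preds = (torch.sigmoid(logits)[0] > 0.5).long().tolist()
    return dict(zip(["Rule Followed", "Question Asked", "Question Answered"], preds))

print(predict_compliance("Is this your address?", "Yes, it is."))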