import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
# Load your CSV file into a pandas DataFrame
df = pd.read_csv("dishTV_customer_service_with_address_and_rules_accurate_v2.csv")
# Print column names and the first few rows to verify the expected structure
print(df.columns)
print(df.head())
# Create a conversation column by merging the agent's and customer's utterances
df['Conversation'] = df['Agent Utterance'] + " " + df['Customer Utterance']
# Map the three Yes/No label columns to 1/0 (Rule Followed, Question Asked, Question Answered)
df['Rule Followed'] = df['Rule Followed'].map({'Yes': 1, 'No': 0})
df['Question Asked'] = df['Question Asked'].map({'Yes': 1, 'No': 0})
df['Question Answered'] = df['Question Answered'].map({'Yes': 1, 'No': 0})
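# Optional sanity check (added sketch): pandas .map() turns any value outside
# {'Yes', 'No'} into NaN, which would silently corrupt the labels below.
label_cols = ['Rule Followed', 'Question Asked', 'Question Answered']
assert not df[label_cols].isnull().any().any(), "Found label values other than 'Yes'/'No' in the CSV"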
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Conversation'].tolist(),
    df[['Rule Followed', 'Question Asked', 'Question Answered']].values,
    test_size=0.2,
    random_state=42  # fixed seed so the split is reproducible across runs
)
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize the conversations
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)
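# Optional spot check (a sketch, assuming the first training example is representative):
# decode one encoded example to confirm the tokenizer round-trips the merged conversation.
print(tokenizer.decode(train_encodings['input_ids'][0]))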
# Create PyTorch datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.astype(np.float32)  # float labels: BCEWithLogitsLoss expects float targets
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels.astype(np.float32)
})
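# Quick shape check: each example should carry one float per label head.
assert len(train_dataset[0]['labels']) == 3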
# Initialize the BERT model for multi-label classification (3 independent binary labels)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    problem_type="multi_label_classification"  # use BCEWithLogitsLoss instead of cross-entropy
)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',   # evaluate at the end of each epoch
    save_strategy='epoch',   # save a checkpoint at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True,       # reload the best checkpoint (by eval accuracy) after training
    metric_for_best_model="accuracy",
    do_train=True,
    do_eval=True
)
# Exact-match (subset) accuracy for multi-label outputs: a prediction counts as
# correct only if all three labels are right. Note that logit > 0 <=> sigmoid(logit) > 0.5,
# so we can threshold the raw logits directly (argmax would pick a single class and is
# wrong for independent per-label decisions).
def compute_metrics(p):
    preds = (p.predictions > 0).astype(int)
    labels = p.label_ids.astype(int)
    return {'accuracy': np.mean(np.all(preds == labels, axis=1))}

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Start training
trainer.train()
# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
# Define a new conversation for testing
new_conversation = ["Hello! How can I assist you today? I just wanted to check the status of my account."]
# Tokenize the new conversation
test_encodings = tokenizer(new_conversation, truncation=True, padding=True, max_length=128, return_tensors='pt')  # same max_length as training
# Make predictions
model.eval()
test_encodings = {k: v.to(model.device) for k, v in test_encodings.items()}  # Trainer may have moved the model to GPU
with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = torch.sigmoid(outputs.logits).cpu().numpy()  # sigmoid gives independent per-label probabilities
# Display predictions
print(f"Predictions (Rule Followed, Question Asked, Question Answered): {predictions}")
# Threshold at 0.5 (via rounding): each label is an independent binary decision
predictions_rounded = np.round(predictions)
print(f"Predictions (rounded): {predictions_rounded}")