Spaces:

abven
/

Customer-Support-Chatbot

Runtime error

Customer-Support-Chatbot / src /preprocess.py

VenkateshRoshan

Initial Code Added

94c58a1 5 months ago

1.25 kB

	import pandas as pd
	from transformers import AutoTokenizer

	def load_data(file_path):
	    """Read the customer support dataset from a CSV file.

	    Args:
	        file_path: Location of the CSV file; handed to ``pd.read_csv`` as is.

	    Returns:
	        The parsed file contents as a ``pandas.DataFrame``.
	    """
	    return pd.read_csv(file_path)

	def preprocess_data(data, model_name="EleutherAI/gpt-neo-125M", max_length=256):
	    """Tokenize the ``instruction`` and ``response`` columns of ``data``.

	    Every value of both columns is passed through the tokenizer with
	    truncation enabled and padding to exactly ``max_length`` tokens.

	    Note:
	        ``data`` is modified in place: the ``instruction_tokens`` and
	        ``response_tokens`` columns are added to it (or overwritten).

	    Args:
	        data: DataFrame that has ``instruction`` and ``response`` columns.
	        model_name: Checkpoint name given to ``AutoTokenizer.from_pretrained``.
	        max_length: Length every sequence is truncated/padded to.

	    Returns:
	        A DataFrame holding only the ``instruction_tokens`` and
	        ``response_tokens`` columns; each cell is whatever the tokenizer
	        call returned for that row's text.

	    Raises:
	        KeyError: If ``data`` lacks an ``instruction`` or ``response`` column.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    # padding="max_length" needs a pad token, so the EOS token is reused
	    # (presumably this tokenizer ships without one -- confirm before
	    # switching to another checkpoint).
	    tokenizer.pad_token = tokenizer.eos_token

	    def tokenize_text(text):
	        """Tokenize one text value to a fixed-length encoding."""
	        return tokenizer(
	            text,
	            truncation=True,
	            padding="max_length",
	            max_length=max_length,
	        )

	    # Tokenize column by column rather than unpacking zip(*data.apply(...)):
	    # on an empty frame apply() does not produce one tuple per row, so the
	    # two-target unpacking raised ValueError instead of giving an empty result.
	    data['instruction_tokens'] = [tokenize_text(text) for text in data['instruction']]
	    data['response_tokens'] = [tokenize_text(text) for text in data['response']]
	    return data[['instruction_tokens', 'response_tokens']]

	if __name__ == "__main__":
	    # Script entry point: load the raw CSV, tokenize it, and write the result.
	    # Imported here because only the script path needs it.
	    from pathlib import Path

	    output_path = Path('data/processed/customer_support_preprocessed.csv')

	    data = load_data('data/raw/customer_support.csv')
	    processed_data = preprocess_data(data)

	    # to_csv does not create missing parent directories; without this a
	    # fresh checkout fails only after all the tokenization work is done.
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    processed_data.to_csv(output_path, index=False)