# VenkateshRoshan
# Initial Code Added
# 94c58a1
import pandas as pd
from transformers import AutoTokenizer
def load_data(file_path):
    """
    Read the customer support dataset stored as CSV.

    Args:
        file_path: Location of the CSV file; handed to ``pd.read_csv`` as-is.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(file_path)
def preprocess_data(data, *, model_name="EleutherAI/gpt-neo-125M", max_length=256):
    """
    Preprocess data by tokenizing the instructions and responses.

    Every value in the ``instruction`` and ``response`` columns is passed to a
    tokenizer loaded with ``AutoTokenizer.from_pretrained(model_name)``, using
    ``truncation=True``, ``padding="max_length"`` and the given ``max_length``.
    The input DataFrame is not modified.

    Args:
        data: DataFrame containing ``instruction`` and ``response`` columns.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: ``max_length`` value passed to every tokenizer call.

    Returns:
        A new DataFrame with the columns ``instruction_tokens`` and
        ``response_tokens``, sharing the index of ``data``. Each cell holds
        the object returned by the tokenizer for that text. For a frame with
        no rows, an empty frame with those two columns is returned and the
        tokenizer is never loaded.

    Raises:
        KeyError: If ``instruction`` or ``response`` is not a column of ``data``.
    """
    # Fail early, before the (potentially slow) tokenizer load.
    missing = [col for col in ('instruction', 'response') if col not in data.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Nothing to tokenize: skip loading the tokenizer altogether.
    if len(data) == 0:
        return pd.DataFrame(
            columns=['instruction_tokens', 'response_tokens'],
            index=data.index,
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Padding needs a pad token; fall back to EOS when the tokenizer does not
    # define one, without clobbering a pad token a model already provides.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def tokenize_text(text):
        """
        Helper function to tokenize a single instruction or response.
        """
        return tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    # Build a fresh frame instead of adding columns to the caller's DataFrame.
    return pd.DataFrame({
        'instruction_tokens': data['instruction'].map(tokenize_text),
        'response_tokens': data['response'].map(tokenize_text),
    })
if __name__ == "__main__":
    # Script entry point: load the raw CSV, tokenize it, write the result.
    # Imported here so the module-level import list stays untouched.
    from pathlib import Path

    output_path = Path('data/processed/customer_support_preprocessed.csv')
    # Create the output directory up front so the final write cannot fail on
    # a missing folder after all the tokenization work has been done.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    data = load_data('data/raw/customer_support.csv')
    processed_data = preprocess_data(data)

    # NOTE(review): the cells are tokenizer result objects, so the CSV will
    # contain their string form - confirm this is the intended on-disk format.
    processed_data.to_csv(output_path, index=False)