# Hugging Face Space status: Runtime error
from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer
def load_data(file_path):
    """Load the customer support dataset from a CSV file.

    Args:
        file_path: Location of the CSV file; passed unchanged to
            ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    return pd.read_csv(file_path)
def preprocess_data(data):
    """Tokenize the ``instruction`` and ``response`` columns of *data*.

    Every text is tokenized on its own with the
    ``EleutherAI/gpt-neo-125M`` tokenizer, truncated and padded to 256
    tokens. The tokenizer's EOS token is reused as its padding token.

    Args:
        data: DataFrame containing ``instruction`` and ``response`` columns.
            It is not modified.

    Returns:
        A new DataFrame, indexed like *data*, with the columns
        ``instruction_tokens`` and ``response_tokens`` holding the
        tokenizer output for each row.

    Raises:
        KeyError: If ``instruction`` or ``response`` is missing from *data*.
    """
    source_columns = ("instruction", "response")
    output_columns = ["instruction_tokens", "response_tokens"]

    # Validate before loading the tokenizer so bad input fails fast.
    missing = [name for name in source_columns if name not in data.columns]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    # Nothing to tokenize: return the expected shape without loading a model.
    if data.empty:
        return pd.DataFrame(columns=output_columns, index=data.index)

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_text(value):
        """Tokenize one cell, treating a missing value as an empty string."""
        # Empty CSV cells are read as NaN (a float); the tokenizer is given
        # a string in every case.
        text = "" if pd.isna(value) else str(value)
        return tokenizer(
            text, truncation=True, padding="max_length", max_length=256
        )

    # Build a fresh frame instead of adding columns to the caller's object.
    tokenized = pd.DataFrame(index=data.index)
    for source, target in zip(source_columns, output_columns):
        tokenized[target] = [tokenize_text(value) for value in data[source]]
    return tokenized
if __name__ == "__main__":
    # Script entry point: load the raw CSV, tokenize it, write the result.
    output_path = Path('data/processed/customer_support_preprocessed.csv')

    data = load_data('data/raw/customer_support.csv')
    processed_data = preprocess_data(data)

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    processed_data.to_csv(output_path, index=False)