File size: 1,249 Bytes
94c58a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import pandas as pd
from transformers import AutoTokenizer

def load_data(file_path):
    """
    Read the customer support dataset stored as CSV at ``file_path``.

    The path is handed straight to ``pandas.read_csv`` with default
    options, and the resulting DataFrame is returned unchanged.
    """
    return pd.read_csv(file_path)

def preprocess_data(data, *, model_name="EleutherAI/gpt-neo-125M", max_length=256):
    """
    Tokenize the ``instruction`` and ``response`` columns of ``data``.

    Every text is passed to the tokenizer with ``truncation=True`` and
    ``padding="max_length"``, so each encoding is ``max_length`` tokens long.
    The input DataFrame is not modified; a new DataFrame is returned. An
    empty input yields an empty result with the same two output columns.

    Args:
        data: DataFrame with an ``instruction`` and a ``response`` column.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Length that every encoding is truncated/padded to.

    Returns:
        A DataFrame sharing ``data``'s index, with the columns
        ``instruction_tokens`` and ``response_tokens``. Each cell holds the
        object returned by the tokenizer call for that row's text.

    Raises:
        KeyError: If ``data`` lacks an ``instruction`` or ``response`` column.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Padding needs a pad token; reuse the end-of-sequence token for it.
    tokenizer.pad_token = tokenizer.eos_token

    def encode(text):
        """Tokenize one text with the fixed truncation/padding settings."""
        return tokenizer(
            text, truncation=True, padding="max_length", max_length=max_length
        )

    instruction_tokens = [encode(text) for text in data['instruction']]
    response_tokens = [encode(text) for text in data['response']]

    # Build a fresh frame instead of assigning columns onto ``data`` so the
    # caller's DataFrame stays untouched. dtype=object keeps each encoding as
    # a single cell value and keeps empty columns typed consistently.
    return pd.DataFrame(
        {
            'instruction_tokens': pd.Series(
                instruction_tokens, index=data.index, dtype=object
            ),
            'response_tokens': pd.Series(
                response_tokens, index=data.index, dtype=object
            ),
        }
    )

if __name__ == "__main__":
    # Script entry point: load the raw CSV, tokenize it, and write the result.
    import os

    input_path = 'data/raw/customer_support.csv'
    output_path = 'data/processed/customer_support_preprocessed.csv'

    # to_csv does not create missing parent directories, so make sure the
    # output directory exists before the tokenization work starts.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    data = load_data(input_path)
    processed_data = preprocess_data(data)
    processed_data.to_csv(output_path, index=False)