from transformers import BertTokenizer
from datasets import load_dataset

# Load the dataset (assuming it's a CSV file with a 'Response' column)
dataset = load_dataset('csv', data_files='interview_data.csv', delimiter=',')

# Initialize the tokenizer (using the BERT tokenizer as an example)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the responses
def tokenize_function(examples):
    return tokenizer(examples['Response'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split into train and test datasets. Loading a single CSV produces only a
# 'train' split, so derive the test set explicitly rather than indexing a
# non-existent 'test' key.
split = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = split['train']
test_dataset = split['test']
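
# A quick follow-up sketch, not part of the original snippet. It assumes the
# 'interview_data.csv' file and 'Response' column used above, and the column
# names the BERT tokenizer produces. It inspects one tokenized example and
# exposes PyTorch tensors for training; one common way to prepare the data.
print(train_dataset[0]['input_ids'][:10])  # first 10 token ids of one example

# Keep only the tensor columns a PyTorch model consumes
train_dataset = train_dataset.with_format(
    'torch', columns=['input_ids', 'attention_mask', 'token_type_ids']
)
test_dataset = test_dataset.with_format(
    'torch', columns=['input_ids', 'attention_mask', 'token_type_ids']
)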