from transformers import BertTokenizer
from datasets import load_dataset

# Load the dataset (assuming it's a CSV file with a 'Response' column)
dataset = load_dataset('csv', data_files='interview_data.csv', delimiter=',')

# Initialize tokenizer (using the BERT tokenizer as an example)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the responses
def tokenize_function(examples):
    return tokenizer(examples['Response'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Split into train and test datasets. Loading a single CSV produces only a
# 'train' split, so the test split has to be created explicitly.
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.2)
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']
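
# Quick sanity check (a minimal sketch, assuming the split above ran and the
# 'Response' column was tokenized): inspect one example and decode its
# input_ids back to text.
sample = train_dataset[0]
print(sample['input_ids'][:10])  # first ten token IDs
print(tokenizer.decode(sample['input_ids'], skip_special_tokens=True))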