from transformers import BertTokenizer
from datasets import load_dataset
# Load the dataset (assuming it's a CSV file)
dataset = load_dataset('csv', data_files='interview_data.csv', delimiter=',')
# Initialize tokenizer (using BERT tokenizer for example)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize the responses
def tokenize_function(examples):
    return tokenizer(examples['Response'], padding='max_length', truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Split into train and test sets; loading a CSV yields only a 'train'
# split, so hold out a portion (20% here) for testing
split_datasets = tokenized_datasets['train'].train_test_split(test_size=0.2)
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']
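# Optionally format the splits for PyTorch; a minimal sketch assuming a
# PyTorch workflow downstream (the column list covers the BERT tokenizer
# outputs; a 'labels' column would also be needed for training and is not
# produced by this script)
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])
test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask'])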