from datasets import load_dataset
from transformers import AutoTokenizer

# Load a dataset from a CSV file
dataset = load_dataset('csv', data_files='data.csv')

# Instantiate a tokenizer; bert-base-uncased is used here as an example checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the "text" column, padding/truncating to the model's max length
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenizer to the whole dataset in batches for speed
tokenized_dataset = dataset.map(tokenize_function, batched=True)
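
# A minimal sanity check (assumes the CSV has a "text" column, as above).
# load_dataset('csv', ...) returns a DatasetDict with a default "train" split;
# the tokenized example should now contain input_ids and attention_mask.
print(tokenized_dataset["train"][0].keys())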