dan_AI/train/data.py
from datasets import load_dataset
from transformers import AutoTokenizer

# Load a pretrained tokenizer (checkpoint name here is a placeholder; use the model this repo trains)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load a dataset from a CSV file
dataset = load_dataset("csv", data_files="data.csv")

# Tokenize the "text" column, padding and truncating to the model's max length
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
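
# --- Usage sketch (not in the original file) ---
# A minimal check of the tokenized output, assuming data.csv has a "text" column
# and that load_dataset produced the default "train" split.
print(tokenized_dataset["train"].column_names)
print(tokenized_dataset["train"][0]["input_ids"][:10])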