File size: 959 Bytes
eca6215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from datasets import load_dataset

def formatting_prompts_func(examples, template, eos_token):
    """Render a batch of examples into prompt strings.

    Args:
        examples: Mapping with "instruction", "input" and "output" entries,
            each an iterable of values; the entries are walked in lockstep.
        template: Format string with three positional placeholders, filled
            in the order instruction, input, output.
        eos_token: String appended to the end of every rendered prompt.

    Returns:
        A dict with a single "text" key holding the list of rendered prompts.
    """
    # zip stops at the shortest column, so one prompt is produced per
    # complete (instruction, input, output) triple.
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [template.format(*row) + eos_token for row in rows]
    return {"text": rendered}

def load_and_prepare_dataset(dataset_name, nsamples, formatting_func, template, eos_token):
    """Load a slice of a dataset's train split and format it.

    Args:
        dataset_name: Identifier passed straight to ``load_dataset``.
        nsamples: Number of leading rows to keep; rows are picked with
            ``select(range(nsamples))``.
        formatting_func: Callable invoked as
            ``formatting_func(examples, template, eos_token)`` on each batch.
        template: Forwarded unchanged to ``formatting_func``.
        eos_token: Forwarded unchanged to ``formatting_func``.

    Returns:
        The result of mapping ``formatting_func`` over the selected rows in
        batched mode.
    """
    train_split = load_dataset(dataset_name, split="train")
    subset = train_split.select(range(nsamples))

    def _format_batch(examples):
        # Bind the template and EOS token so map() only has to supply the batch.
        return formatting_func(examples, template, eos_token)

    return subset.map(_format_batch, batched=True)