File size: 1,174 Bytes
d660b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from clearml import PipelineDecorator
from llm_engineering.domain.dataset import DatasetType
from steps import generate_datasets as cd_steps


@PipelineDecorator.pipeline(name="evaluating", project="CS370")
def generate_datasets(

    dataset_type: DatasetType = DatasetType.INSTRUCTION,

    test_split_size: float = 0.1,

    push_to_huggingface: bool = False,

    dataset_id: str | None = None,

    mock: bool = False,

    wait_for: str | list[str] | None = None,

) -> None:
    cleaned_documents = cd_steps.query_feature_store(after=wait_for)
    prompts = cd_steps.create_prompts(documents=cleaned_documents, dataset_type=dataset_type)
    if dataset_type == DatasetType.INSTRUCTION:
        dataset = cd_steps.generate_intruction_dataset(prompts=prompts, test_split_size=test_split_size, mock=mock)
    elif dataset_type == DatasetType.PREFERENCE:
        dataset = cd_steps.generate_preference_dataset(prompts=prompts, test_split_size=test_split_size, mock=mock)
    else:
        raise ValueError(f"Invalid dataset type: {dataset_type}")

    if push_to_huggingface:
        cd_steps.push_to_huggingface(dataset=dataset, dataset_id=dataset_id)