Spaces:
Build error
Build error
File size: 1,776 Bytes
d660b02 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
from typing import Any
from typing_extensions import Annotated
from clearml import PipelineDecorator
from llm_engineering.application.dataset import generation
from llm_engineering.domain.dataset import DatasetType, InstructTrainTestSplit
from llm_engineering.domain.prompt import GenerateDatasetSamplesPrompt
from llm_engineering.domain.types import DataCategory
@PipelineDecorator.component(name="generate_intruction_dataset")
def generate_intruction_dataset(
    prompts: Annotated[dict[DataCategory, list[GenerateDatasetSamplesPrompt]], "prompts"],
    test_split_size: Annotated[float, "test_split_size"],
    mock: Annotated[bool, "mock_generation"] = False,
) -> Annotated[InstructTrainTestSplit, None]:
    """Pipeline component that generates the instruction dataset from prompts.

    Looks up the dataset generator registered for ``DatasetType.INSTRUCTION``
    and runs it on the given prompts.

    Args:
        prompts: Mapping of data category to the list of generation prompts
            for that category.
        test_split_size: Forwarded to the generator as ``test_size``.
        mock: Forwarded to the generator as ``mock``; presumably switches to
            mocked generation (confirm against the generator implementation).

    Returns:
        Whatever the generator's ``generate`` call returns, annotated as an
        ``InstructTrainTestSplit``.
    """
    instruct_generator = generation.get_dataset_generator(DatasetType.INSTRUCTION)

    # NOTE: attaching output metadata (see _get_metadata_instruct_dataset) to
    # the step is currently disabled; only the generated split is returned.
    return instruct_generator.generate(prompts, test_size=test_split_size, mock=mock)
def _get_metadata_instruct_dataset(datasets: InstructTrainTestSplit) -> dict[str, Any]:
    """Build a metadata summary for a train/test split of instruct datasets.

    Args:
        datasets: Split whose ``train`` and ``test`` attributes map a category
            to a dataset exposing ``num_samples``, and which carries a
            ``test_split_size`` attribute.

    Returns:
        A dictionary holding the categories of the train split, the split's
        ``test_split_size``, and the per-category sample counts of the train
        and test splits.
    """

    def _samples_per_category(split):
        # One entry per category, valued with that dataset's sample count.
        return {name: subset.num_samples for name, subset in split.items()}

    categories = [*datasets.train.keys()]
    train_counts = _samples_per_category(datasets.train)
    test_counts = _samples_per_category(datasets.test)

    return {
        "data_categories": categories,
        "test_split_size": datasets.test_split_size,
        "train_num_samples_per_category": train_counts,
        "test_num_samples_per_category": test_counts,
    }
|