File size: 957 Bytes
d660b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from loguru import logger
from typing_extensions import Annotated
from clearml import PipelineDecorator

from llm_engineering.domain.dataset import InstructTrainTestSplit, PreferenceTrainTestSplit
from llm_engineering.settings import settings


@PipelineDecorator.component(name="push_to_huggingface")
def push_to_huggingface(
    dataset: Annotated[InstructTrainTestSplit | PreferenceTrainTestSplit, "dataset_split"],
    dataset_id: Annotated[str, "dataset_id"],
) -> None:
    """Push a train/test dataset split to the Hugging Face Hub.

    The split is converted with ``dataset.to_huggingface(flatten=True)`` and
    the result is uploaded via ``push_to_hub`` under ``dataset_id``, using
    ``settings.HUGGINGFACE_ACCESS_TOKEN`` as the token.

    Args:
        dataset: The instruct or preference train/test split to upload.
        dataset_id: Identifier passed to ``push_to_hub`` as the target
            dataset (presumably a Hub repo id such as ``"user/name"`` --
            confirm against callers).

    Raises:
        AssertionError: If ``dataset_id`` is ``None`` or no Hugging Face
            access token is configured in ``settings``.
    """
    # Explicit raises instead of `assert` statements: asserts are removed when
    # Python runs with -O, which would let a missing id/token slip through to
    # the upload call. AssertionError is kept so the exception type seen by
    # existing callers does not change.
    if dataset_id is None:
        raise AssertionError("Dataset id must be provided for pushing to Huggingface")
    if settings.HUGGINGFACE_ACCESS_TOKEN is None:
        raise AssertionError("Huggingface access token must be provided for pushing to Huggingface")

    logger.info(f"Pushing dataset {dataset_id} to Hugging Face.")

    huggingface_dataset = dataset.to_huggingface(flatten=True)
    huggingface_dataset.push_to_hub(dataset_id, token=settings.HUGGINGFACE_ACCESS_TOKEN)