File size: 725 Bytes
d660b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from clearml import PipelineDecorator, Task


from steps import feature_engineering as fe_steps


@PipelineDecorator.pipeline(name="feature_engineering", project="CS370")
def feature_engineering(
    author_full_names: list[str],
    wait_for: str | list[str] | None = None,
    **kwargs,
) -> list[str]:
    """Run the feature-engineering pipeline for the given authors.

    The documents are queried, cleaned, and loaded to the vector DB; the
    cleaned documents are then chunked/embedded and loaded as well.

    Args:
        author_full_names: Passed to ``fe_steps.query_data_warehouse`` to
            select which documents are fetched.
        wait_for: Forwarded as the ``after`` argument of the query step.
            NOTE(review): presumably upstream step id(s) this pipeline
            should be ordered after -- confirm against the step definition.
        **kwargs: Accepted but not used by this function.

    Returns:
        A two-element list holding the ``invocation_id`` of the load step
        for the cleaned documents followed by that of the load step for the
        embedded documents.
    """
    documents = fe_steps.query_data_warehouse(author_full_names, after=wait_for)
    cleaned = fe_steps.clean_documents(documents)

    # First load: the cleaned documents themselves.
    cleaned_load = fe_steps.load_to_vector_db(cleaned)

    # Second load: the chunked-and-embedded form of the same cleaned documents.
    embedded = fe_steps.chunk_and_embed(cleaned)
    embedded_load = fe_steps.load_to_vector_db(embedded)

    return [step.invocation_id for step in (cleaned_load, embedded_load)]