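"""CLI for exporting the data warehouse (MongoDB) to JSON files and for
importing those files back into it. One JSON file is written or read per
document category (articles, posts, repositories, users).

Usage (the script path below is illustrative; adjust it to wherever this
file lives in your repo):

    python export_import_data_warehouse.py --export-raw-data
    python export_import_data_warehouse.py --import-raw-data --data-dir data/data_warehouse_raw_data
"""
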
import json
from pathlib import Path

import click
from loguru import logger

from llm_engineering.domain.base.nosql import NoSQLBaseDocument
from llm_engineering.domain.documents import ArticleDocument, PostDocument, RepositoryDocument, UserDocument


@click.command()
@click.option(
    "--export-raw-data",
    is_flag=True,
    default=False,
    help="Whether to export your data warehouse to JSON files (one per data category).",
)
@click.option(
    "--import-raw-data",
    is_flag=True,
    default=False,
    help="Whether to import JSON files into your data warehouse.",
)
@click.option(
    "--data-dir",
    default=Path("data/data_warehouse_raw_data"),
    type=Path,
    help="Path to the directory containing the data warehouse raw data JSON files.",
)
def main(
    export_raw_data: bool,
    import_raw_data: bool,
    data_dir: Path,
) -> None:
    assert export_raw_data or import_raw_data, "Specify at least one operation: --export-raw-data or --import-raw-data."

    if export_raw_data:
        __export(data_dir)

    if import_raw_data:
        __import(data_dir)


def __export(data_dir: Path) -> None:
    logger.info(f"Exporting data warehouse to {data_dir}...")
    data_dir.mkdir(parents=True, exist_ok=True)

    __export_data_category(data_dir, ArticleDocument)
    __export_data_category(data_dir, PostDocument)
    __export_data_category(data_dir, RepositoryDocument)
    __export_data_category(data_dir, UserDocument)


def __export_data_category(data_dir: Path, category_class: type[NoSQLBaseDocument]) -> None:
    data = category_class.bulk_find()
    serialized_data = [d.to_mongo() for d in data]
    export_file = data_dir / f"{category_class.__name__}.json"

    logger.info(f"Exporting {len(serialized_data)} items of {category_class.__name__} to {export_file}...")
    with export_file.open("w") as f:
        json.dump(serialized_data, f)


def __import(data_dir: Path) -> None:
    logger.info(f"Importing data warehouse from {data_dir}...")
    assert data_dir.is_dir(), f"{data_dir} is not a directory or it doesn't exist."

    data_category_classes = {
        "ArticleDocument": ArticleDocument,
        "PostDocument": PostDocument,
        "RepositoryDocument": RepositoryDocument,
        "UserDocument": UserDocument,
    }

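    # Match each JSON file to its document class by filename stem (e.g., "ArticleDocument.json").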
    for file in data_dir.iterdir():
        if not file.is_file():
            continue

        category_class_name = file.stem
        category_class = data_category_classes.get(category_class_name)
        if not category_class:
            logger.warning(f"Skipping {file} as it does not match any data category.")
            continue

        __import_data_category(file, category_class)


def __import_data_category(file: Path, category_class: type[NoSQLBaseDocument]) -> None:
    with file.open("r") as f:
        data = json.load(f)

    logger.info(f"Importing {len(data)} items of {category_class.__name__} from {file}...")
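    # Only deserialize and bulk-insert when the file actually contains documents.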
    if len(data) > 0:
        deserialized_data = [category_class.from_mongo(d) for d in data]
        category_class.bulk_insert(deserialized_data)


if __name__ == "__main__":
    main()