"""Split markdown documents into chunked `Document` objects and build
filehash-to-filename and filehash-to-document-id mappings for embedding."""

import hashlib
import urllib.parse
import uuid
from pathlib import Path
from typing import Callable, List, Optional, Tuple

import pandas as pd
from loguru import logger

from app.config.models.configs import Document, Config
from app.parsers.markdown import markdown_splitter

HASH_BLOCKSIZE = 65536


class DocumentSplitter:
    """Splits configured document folders into chunked `Document` objects."""

    def __init__(self, config: Config) -> None:
        self.document_path_settings = config.embeddings.document_settings
        self.chunk_sizes = config.embeddings.chunk_sizes

    def split(
            self,
            limit: Optional[int] = None,
    ) -> Tuple[List[Document], pd.DataFrame, pd.DataFrame]:
        """Split all configured documents into chunks.

        Returns the chunked documents, a filehash -> filename mapping, and a
        filehash -> document-id mapping. When `limit` is set, each of the
        three outputs is truncated to its first `limit` entries.
        """
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for setting in self.document_path_settings:
            passage_prefix = setting.passage_prefix
            docs_path = Path(setting.doc_path)

            # Only markdown sources are currently parsed.
            extension = "md"
            for chunk_size in self.chunk_sizes:
                paths = list(docs_path.glob(f"**/*.{extension}"))

                additional_parser_settings = setting.additional_parser_settings.get(
                    extension, dict()
                )

                (
                    docs,
                    hf_mappings,
                    hd_mappings,
                ) = self._get_documents_from_custom_splitter(
                    document_paths=paths,
                    splitter_func=markdown_splitter,
                    max_size=chunk_size,
                    passage_prefix=passage_prefix,
                    **additional_parser_settings,
                )

                all_docs.extend(docs)
                hash_filename_mappings.extend(hf_mappings)
                hash_docid_mappings.extend(hd_mappings)

        all_hash_filename_mappings = pd.DataFrame(hash_filename_mappings)
        all_hash_docid_mappings = (
            pd.concat(hash_docid_mappings, axis=0)
            if hash_docid_mappings
            else pd.DataFrame(columns=["docid", "filehash"])
        )

        if limit:
            all_docs = all_docs[:limit]
            all_hash_filename_mappings = all_hash_filename_mappings[:limit]
            all_hash_docid_mappings = all_hash_docid_mappings[:limit]

        return all_docs, all_hash_filename_mappings, all_hash_docid_mappings

    def _get_documents_from_custom_splitter(
            self,
            document_paths: List[Path],
            splitter_func: Callable,
            max_size: int,
            passage_prefix: str,
            **additional_kwargs,
    ) -> Tuple[List[Document], List[dict], List[pd.DataFrame]]:
        all_docs = []
        hash_filename_mappings = []
        hash_docid_mappings = []

        for path in document_paths:
            # Skip anything that is not a markdown file.
            if path.suffix != ".md":
                continue

            filepath = str(path)
            filename = path.stem

            additional_kwargs.update({"filename": filepath})
            docs_data = splitter_func(path, max_size, **additional_kwargs)
            file_hash = get_md5_hash(path)

            # URL-encode the path so it is safe to store as a document source.
            path = urllib.parse.quote(str(path))  # type: ignore
            logger.info(path)

            docs = [
                Document(
                    page_content=passage_prefix + d["text"],
                    metadata={
                        **d["metadata"],
                        "source": str(path),
                        "chunk_size": max_size,
                        "document_id": str(uuid.uuid1()),
                        "label": filename,
                    },
                )
                for d in docs_data
            ]

            # Normalise a missing page number to -1 so the metadata stays typed.
            for d in docs:
                if "page" in d.metadata and d.metadata["page"] is None:
                    d.metadata["page"] = -1

            all_docs.extend(docs)

            hash_filename_mappings.append(dict(filename=filepath, filehash=file_hash))

            df_hash_docid = (
                pd.DataFrame()
                .assign(docid=[d.metadata["document_id"] for d in docs])
                .assign(filehash=file_hash)
            )

            hash_docid_mappings.append(df_hash_docid)

        logger.info(f"Got {len(all_docs)} nodes.")
        return all_docs, hash_filename_mappings, hash_docid_mappings


def get_md5_hash(file_path: Path) -> str:
    """Return the MD5 hex digest of a file, read in HASH_BLOCKSIZE blocks."""
    hasher = hashlib.md5()

    with open(file_path, "rb") as file:
        buf = file.read(HASH_BLOCKSIZE)
        while buf:
            hasher.update(buf)
            buf = file.read(HASH_BLOCKSIZE)

    return hasher.hexdigest()
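

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original pipeline).
#
# Driving `DocumentSplitter` requires a populated `Config` (with
# `embeddings.document_settings` and `embeddings.chunk_sizes`); how that
# config is loaded lives outside this module, so it is only sketched here
# (`load_config` is a hypothetical loader, not defined in this codebase):
#
#     config = load_config("config.yaml")  # hypothetical
#     splitter = DocumentSplitter(config)
#     docs, hash_to_file, hash_to_docid = splitter.split(limit=100)
#     logger.info(f"{len(docs)} chunks from {len(hash_to_file)} files")
#
# The hashing helper, by contrast, is self-contained, so a small smoke test
# can be run directly against any local files:
if __name__ == "__main__":
    import sys

    for arg in sys.argv[1:]:
        print(f"{arg}: {get_md5_hash(Path(arg))}")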