from KG_classes import FileNode, ChunkNode, Property, Node, Relationship
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
)
from langchain.schema import Document
import os
from typing import List
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader


def env_parse(file):
    """Parse a .env-style file (KEY=VALUE lines) into a dict.

    Lines starting with "#" are treated as comments and skipped.
    """
    credentials = {}
    with open(file, "r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only, so values may contain "=".
            key, _, value = line.partition("=")
            credentials[key.strip()] = value.strip()
    return credentials


def create_file_node(file_path: str) -> FileNode:
    """Create a File node identified by the file's base name."""
    return FileNode(
        id=os.path.basename(file_path),  # portable, unlike splitting on "/"
        type="File",
        properties=[
            Property(key="path", value=file_path),
            Property(key="name", value=os.path.basename(file_path)),
        ],
    )


def create_chunk_node(
    chunk: Document, chunk_idx: int, file_node: FileNode
) -> ChunkNode:
    """Create a Chunk node whose id combines the source file id and chunk index."""
    return ChunkNode(
        id=f"{file_node.id}{chunk_idx}",
        type="Chunk",
        properties=[
            Property(key="content", value=chunk.page_content),
            Property(key="idx", value=str(chunk_idx)),
            Property(key="sourceFileId", value=file_node.id),
        ],
    )


def format_property_key(s: str) -> str:
    """Convert a whitespace-separated key to camelCase, e.g. "source file" -> "sourceFile"."""
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)


def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties


def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    if isinstance(node.properties, dict):
        properties = node.properties
    else:
        properties = props_to_dict(node.properties)  # returns {} for None/empty
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties)  # returns {} for None/empty
    return BaseRelationship(
        source=source, target=target, type=rel.type.capitalize(), properties=properties
    )


def create_relationship(
    source: Node, target: Node, rel_type: str
) -> BaseRelationship:
    """Create a base Relationship between two KnowledgeGraph nodes."""
    return BaseRelationship(
        source=map_to_base_node(source),
        target=map_to_base_node(target),
        type=rel_type.capitalize(),  # renamed from "type" to avoid shadowing the builtin
        properties={},
    )


def load_and_split_documents(
    file_paths: List[str], chunk_size: int = 100, chunk_overlap: int = 20
):
    """
    Load and split multiple documents into chunks.

    Args:
        file_paths (List[str]): List of file paths to load.
        chunk_size (int): Size of each chunk (in tokens).
        chunk_overlap (int): Overlap between chunks (in tokens).

    Returns:
        List: List of split document chunks.
    """
    all_chunks = []
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for file_path in file_paths:
        # Pick a loader by extension (case-insensitive); default to plain text.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)

        pages = loader.load_and_split()
        chunks = text_splitter.split_documents(pages)
        all_chunks.extend(chunks)

    return all_chunks
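

# Minimal usage sketch showing how the helpers above compose into the
# File -> Chunk graph structure this module supports. The ".env" and
# "docs/example.pdf" paths are hypothetical placeholders, and the sketch
# assumes FileNode/ChunkNode behave as the helpers above expect.
if __name__ == "__main__":
    # Graph-store credentials (not used further in this sketch).
    creds = env_parse(".env")

    file_paths = ["docs/example.pdf"]  # hypothetical input document
    chunks = load_and_split_documents(file_paths, chunk_size=100, chunk_overlap=20)

    file_node = create_file_node(file_paths[0])
    for idx, chunk in enumerate(chunks):
        chunk_node = create_chunk_node(chunk, idx, file_node)
        # Link each chunk back to its source file in the graph.
        rel = create_relationship(chunk_node, file_node, "part_of")
        print(rel)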