File size: 3,955 Bytes
f7ab812 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
from uuid import uuid4
from KG_classes import FileNode, ChunkNode, Property, Node, Relationship
from langchain_community.graphs.graph_document import (
Node as BaseNode,
Relationship as BaseRelationship,
)
from langchain.schema import Document
import os
from typing import List
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
def env_parse(file):
credentials = {}
with open(file, "r") as f:
for line in f:
if line[0] == "#":
continue
line = line.split("=")
credentials[line[0]] = line[1].strip()
return credentials
def create_file_node(file_path: str) -> FileNode:
    """Create a graph node representing a source file.

    Args:
        file_path: Path to the file on disk.

    Returns:
        FileNode: Node whose id is the file's base name, carrying
        ``path`` and ``name`` properties.
    """
    # os.path.basename handles OS-specific separators, unlike the
    # previous split("/")[-1], and keeps id consistent with the
    # 'name' property below.
    file_name = os.path.basename(file_path)
    return FileNode(
        id=file_name,
        type="File",
        properties=[
            Property(key="path", value=file_path),
            Property(key="name", value=file_name),
        ],
    )
def create_chunk_node(
    chunk: Document, chunk_idx: int, file_node: FileNode
) -> ChunkNode:
    """Build a chunk node carrying the chunk text and a back-reference
    to the file node it was split from."""
    chunk_props = [
        Property(key="content", value=chunk.page_content),
        Property(key="idx", value=str(chunk_idx)),
        Property(key="sourceFileId", value=file_node.id),
    ]
    # Chunk ids are the parent file id suffixed with the chunk index.
    return ChunkNode(
        id=f"{file_node.id}{chunk_idx}",
        type="Chunk",
        properties=chunk_props,
    )
def format_property_key(s: str) -> str:
    """Convert a whitespace-separated string to camelCase.

    The first word is lower-cased and every following word is
    capitalized; a string containing no words is returned unchanged.
    """
    parts = s.split()
    if not parts:
        return s
    head, *tail = parts
    return head.lower() + "".join(word.capitalize() for word in tail)
def props_to_dict(props) -> dict:
    """Turn an iterable of Property objects into a plain dict.

    Keys are normalized to camelCase via format_property_key; a falsy
    input (None or empty) yields an empty dict.
    """
    if not props:
        return {}
    return {format_property_key(p.key): p.value for p in props}
def map_to_base_node(node: Node) -> BaseNode:
    """Map a KnowledgeGraph Node to the langchain base Node.

    ``node.properties`` may already be a dict or may be a list of
    Property objects; both forms are normalized to a dict. A ``name``
    property equal to the title-cased id is always added.

    Args:
        node: Source knowledge-graph node.

    Returns:
        BaseNode: Node with title-cased id, capitalized type, and a
        normalized properties dict.
    """
    # isinstance is the idiomatic type check and also accepts dict
    # subclasses, unlike the previous type(...) == dict comparison.
    if isinstance(node.properties, dict):
        properties = node.properties
    else:
        properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )
def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map a KnowledgeGraph Relationship to the langchain base
    Relationship, converting both endpoints and normalizing any
    relationship properties to a dict."""
    rel_props = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type.capitalize(),
        properties=rel_props,
    )
def create_relationship(source: Node, target: Node, type: str):
    """Build a base Relationship of the given type between two
    knowledge-graph nodes, with no extra properties."""
    base_source = map_to_base_node(source)
    base_target = map_to_base_node(target)
    return BaseRelationship(
        source=base_source,
        target=base_target,
        type=type.capitalize(),
        properties={},
    )
def load_and_split_documents(
    file_paths: List[str], chunk_size: int = 100, chunk_overlap: int = 20
):
    """
    Load and split multiple documents into token-based chunks.
    Args:
        file_paths (List[str]): List of file paths to load.
        chunk_size (int): Size of each chunk (in tokens).
        chunk_overlap (int): Overlap between chunks (in tokens).
    Returns:
        List: List of split document chunks.
    """
    all_chunks = []
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    for file_path in file_paths:
        # Case-insensitive suffix check so '.PDF' files are not routed
        # to TextLoader (which would fail or mis-read binary content).
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        # load() returns whole documents; the token splitter below does
        # the chunking. The previous load_and_split() pre-split with a
        # default character splitter, so documents were split twice by
        # two different splitters and chunk boundaries depended on both.
        pages = loader.load()
        all_chunks.extend(text_splitter.split_documents(pages))
    return all_chunks
|