import os
from typing import List, Optional
from uuid import uuid4

from langchain.schema import Document
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
)

from KG_classes import FileNode, ChunkNode, Property, Node, Relationship


def env_parse(file):
    """Parse a .env-style file of KEY=VALUE lines into a dictionary."""
    credentials = {}
    with open(file, "r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and comment lines.
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only, so values may themselves contain "=".
            key, value = line.split("=", 1)
            credentials[key.strip()] = value.strip()
    return credentials
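
# Usage sketch: given a hypothetical ".env" file containing lines such as
#
#   NEO4J_URI=bolt://localhost:7687
#   NEO4J_USERNAME=neo4j
#
# env_parse(".env") returns
# {"NEO4J_URI": "bolt://localhost:7687", "NEO4J_USERNAME": "neo4j"}.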


def create_file_node(file_path: str) -> FileNode:
    """Create a file node."""
    # os.path.basename uses the platform's path rules rather than
    # assuming "/" separators as file_path.split("/")[-1] would.
    file_name = os.path.basename(file_path)
    return FileNode(
        id=file_name,
        type="File",
        properties=[
            Property(key="path", value=file_path),
            Property(key="name", value=file_name),
        ],
    )


def create_chunk_node(
    chunk: Document, chunk_idx: int, file_node: FileNode
) -> ChunkNode:
    """Create a chunk node."""
    return ChunkNode(
        # Chunk ids are the source file id plus the chunk index.
        id=f"{file_node.id}{chunk_idx}",
        type="Chunk",
        properties=[
            Property(key="content", value=chunk.page_content),
            Property(key="idx", value=str(chunk_idx)),
            Property(key="sourceFileId", value=file_node.id),
        ],
    )


def format_property_key(s: str) -> str:
    """Convert a whitespace-separated key to lowerCamelCase."""
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)


def props_to_dict(props: Optional[List[Property]]) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties


def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    if isinstance(node.properties, dict):
        properties = node.properties
    else:
        # props_to_dict already returns {} for empty or None properties.
        properties = props_to_dict(node.properties)

    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type.capitalize(), properties=properties
    )


def create_relationship(source: Node, target: Node, rel_type: str) -> BaseRelationship:
    """Create a base Relationship of the given type between two nodes."""
    # "rel_type" avoids shadowing the built-in type().
    source = map_to_base_node(source)
    target = map_to_base_node(target)
    return BaseRelationship(
        source=source, target=target, type=rel_type.capitalize(), properties={}
    )
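
# Usage sketch: linking a chunk node back to its file node. Note that the
# relationship type is normalized with .capitalize(), so e.g. "PART_OF"
# is stored as "Part_of":
#
#   rel = create_relationship(chunk_node, file_node, "PART_OF")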


def load_and_split_documents(
    file_paths: List[str], chunk_size: int = 100, chunk_overlap: int = 20
) -> List[Document]:
    """
    Load and split multiple documents into chunks.

    Args:
        file_paths (List[str]): List of file paths to load.
        chunk_size (int): Size of each chunk (in tokens).
        chunk_overlap (int): Overlap between chunks (in tokens).

    Returns:
        List[Document]: List of split document chunks.
    """
    all_pages = []
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for file_path in file_paths:
        # PDFs need a dedicated loader; everything else is treated as plain text.
        if file_path.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)

        pages = loader.load_and_split()
        chunks = text_splitter.split_documents(pages)
        all_pages.extend(chunks)

    return all_pages
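

# A minimal end-to-end sketch, guarded so it only runs when this module is
# executed directly. The file path is a hypothetical placeholder, and the
# demo assumes FileNode/ChunkNode are Node subclasses as defined in KG_classes.
if __name__ == "__main__":
    chunks = load_and_split_documents(["example.pdf"])
    file_node = create_file_node("example.pdf")
    for idx, chunk in enumerate(chunks):
        chunk_node = create_chunk_node(chunk, idx, file_node)
        # Link each chunk back to its source file.
        rel = create_relationship(chunk_node, file_node, "PART_OF")
        print(rel)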