axa / graphRAG /utils.py
Mayara Ayat
Upload folder using huggingface_hub
f7ab812 verified
from uuid import uuid4
from KG_classes import FileNode, ChunkNode, Property, Node, Relationship
from langchain_community.graphs.graph_document import (
Node as BaseNode,
Relationship as BaseRelationship,
)
from langchain.schema import Document
import os
from typing import List
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
def env_parse(file):
    """Parse a simple ``KEY=VALUE`` env file into a dictionary.

    Blank lines, lines whose first non-blank character is ``#`` and lines
    without an ``=`` separator are skipped. Every other line is split at the
    first ``=`` only, so a value may itself contain ``=`` (connection
    strings, base64 padding, ...).

    Args:
        file: Path of the file to read.

    Returns:
        dict: Key -> value, both stripped of surrounding whitespace.
    """
    credentials = {}
    with open(file, "r") as f:
        for raw_line in f:
            entry = raw_line.strip()
            # A blank line used to crash with IndexError; skip it like a comment.
            if not entry or entry.startswith("#"):
                continue
            key, separator, value = entry.partition("=")
            if not separator:
                # Not an assignment, so there is nothing to record.
                continue
            credentials[key.strip()] = value.strip()
    return credentials
def create_file_node(file_path: str) -> FileNode:
    """Build the ``File`` node describing one source file.

    The node id is the text after the last ``/`` in ``file_path``. The node
    carries two properties: ``path`` (the path as given) and ``name``
    (``os.path.basename`` of the path).
    """
    node_id = file_path.rsplit("/", 1)[-1]
    path_property = Property(key="path", value=file_path)
    name_property = Property(key="name", value=os.path.basename(file_path))
    return FileNode(
        id=node_id,
        type="File",
        properties=[path_property, name_property],
    )
def create_chunk_node(
    chunk: Document, chunk_idx: int, file_node: FileNode
) -> ChunkNode:
    """Build the ``Chunk`` node for one chunk of a file.

    The node id is the owning file node's id followed directly by the chunk
    index. The node records the chunk text (``content``), the index as a
    string (``idx``) and the owning file node's id (``sourceFileId``).
    """
    parent_id = file_node.id
    idx_text = str(chunk_idx)
    node_id = parent_id + idx_text
    chunk_properties = [
        Property(key="content", value=chunk.page_content),
        Property(key="idx", value=idx_text),
        Property(key="sourceFileId", value=parent_id),
    ]
    return ChunkNode(id=node_id, type="Chunk", properties=chunk_properties)
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase key.

    The first word is lower-cased, every following word is capitalized
    (first letter upper, rest lower), and the words are joined without
    separators. A string with no words (empty or whitespace only) is
    returned unchanged.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s
    leading, *following = tokens
    return leading.lower() + "".join(map(str.capitalize, following))
def props_to_dict(props) -> dict:
    """Collapse an iterable of key/value property objects into a dict.

    Each item's ``key`` is normalised with ``format_property_key`` and mapped
    to the item's ``value``; when two items normalise to the same key the
    later one wins. A falsy ``props`` (``None``, empty list) gives ``{}``.
    """
    if not props:
        return {}
    return {format_property_key(prop.key): prop.value for prop in props}
def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node.

    ``node.properties`` may be either a dict (used as-is, keys are not
    reformatted) or an iterable of key/value property objects (converted
    with ``props_to_dict``). The title-cased node id is used both as the
    base node's id and as its ``name`` property; the type is capitalized.

    The input node is never modified: a dict of properties is copied before
    the ``name`` entry is added.
    """
    if isinstance(node.properties, dict):
        # Copy so that adding "name" below does not leak into the caller's node.
        properties = dict(node.properties)
    else:
        properties = props_to_dict(node.properties) if node.properties else {}
    display_id = node.id.title()
    # Add name property for better Cypher statement generation
    properties["name"] = display_id
    return BaseNode(
        id=display_id, type=node.type.capitalize(), properties=properties
    )
def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship.

    Both endpoints are converted with ``map_to_base_node``, the relationship
    type is capitalized, and the properties are converted with
    ``props_to_dict`` (an empty dict when there are none).
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    if rel.properties:
        rel_properties = props_to_dict(rel.properties)
    else:
        rel_properties = {}
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type.capitalize(),
        properties=rel_properties,
    )
def create_relationship(source: Node, target: Node, type: str):
    """Create a property-less base Relationship between two KnowledgeGraph nodes.

    Both nodes are converted with ``map_to_base_node`` and the relationship
    type is capitalized. The ``type`` parameter shadows the builtin; the name
    is kept because callers may pass it by keyword.
    """
    base_source = map_to_base_node(source)
    base_target = map_to_base_node(target)
    edge_type = type.capitalize()
    return BaseRelationship(
        source=base_source,
        target=base_target,
        type=edge_type,
        properties={},
    )
def load_and_split_documents(
    file_paths: List[str], chunk_size: int = 100, chunk_overlap: int = 20
):
    """
    Load and split multiple documents into chunks.

    Paths ending in ``.pdf`` (compared case-insensitively) are read with
    ``PyPDFLoader``; every other path is read with ``TextLoader``. The loaded
    documents are then split with a ``TokenTextSplitter``.

    Args:
        file_paths (List[str]): List of file paths to load.
        chunk_size (int): Size of each chunk (in tokens).
        chunk_overlap (int): Overlap between chunks (in tokens).

    Returns:
        List: Split document chunks of all files, in the order of ``file_paths``.
    """
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_chunks = []
    for file_path in file_paths:
        # Case-insensitive so that e.g. "REPORT.PDF" is not read as plain text.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        # NOTE(review): load_and_split() with no argument presumably applies the
        # loader's default splitter before the token splitter below; confirm
        # the double split is intended (loader.load() returns unsplit documents).
        pages = loader.load_and_split()
        all_chunks.extend(text_splitter.split_documents(pages))
    return all_chunks