from KG_classes import FileNode, ChunkNode, Property, Node, Relationship
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
)
from langchain.schema import Document
import os
from typing import List
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader


def env_parse(file):
    """Parse a .env-style file (KEY=VALUE lines) into a dict.

    Lines starting with "#" are treated as comments and skipped.
    """
    credentials = {}
    with open(file, "r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only, so values may contain "=".
            key, _, value = line.partition("=")
            credentials[key.strip()] = value.strip()
    return credentials


def create_file_node(file_path: str) -> FileNode:
    """Create a File node identified by the file's base name."""
    return FileNode(
        id=os.path.basename(file_path),  # portable, unlike splitting on "/"
        type="File",
        properties=[
            Property(key="path", value=file_path),
            Property(key="name", value=os.path.basename(file_path)),
        ],
    )


def create_chunk_node(
    chunk: Document, chunk_idx: int, file_node: FileNode
) -> ChunkNode:
    """Create a Chunk node whose id combines the source file id and chunk index."""
    return ChunkNode(
        id=f"{file_node.id}{chunk_idx}",
        type="Chunk",
        properties=[
            Property(key="content", value=chunk.page_content),
            Property(key="idx", value=str(chunk_idx)),
            Property(key="sourceFileId", value=file_node.id),
        ],
    )


def format_property_key(s: str) -> str:
    """Convert a whitespace-separated key to camelCase, e.g. "source file" -> "sourceFile"."""
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)


def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
        return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties


def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    if isinstance(node.properties, dict):
        properties = node.properties
    else:
        properties = props_to_dict(node.properties)  # returns {} for None/empty
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties)  # returns {} for None/empty
    return BaseRelationship(
        source=source, target=target, type=rel.type.capitalize(), properties=properties
    )


def create_relationship(
    source: Node, target: Node, rel_type: str
) -> BaseRelationship:
    """Create a base Relationship between two KnowledgeGraph nodes."""
    return BaseRelationship(
        source=map_to_base_node(source),
        target=map_to_base_node(target),
        type=rel_type.capitalize(),  # renamed from "type" to avoid shadowing the builtin
        properties={},
    )


def load_and_split_documents(
    file_paths: List[str], chunk_size: int = 100, chunk_overlap: int = 20
):
    """
    Load and split multiple documents into chunks.

    Args:
        file_paths (List[str]): List of file paths to load.
        chunk_size (int): Size of each chunk (in tokens).
        chunk_overlap (int): Overlap between chunks (in tokens).

    Returns:
        List: List of split document chunks.
    """
    all_chunks = []
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for file_path in file_paths:
        # Pick a loader by extension (case-insensitive); default to plain text.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)

        pages = loader.load_and_split()
        chunks = text_splitter.split_documents(pages)
        all_chunks.extend(chunks)

    return all_chunks
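

# Minimal usage sketch showing how the helpers above compose into the
# File -> Chunk graph structure this module supports. The ".env" and
# "docs/example.pdf" paths are hypothetical placeholders, and the sketch
# assumes FileNode/ChunkNode behave as the helpers above expect.
if __name__ == "__main__":
    # Graph-store credentials (not used further in this sketch).
    creds = env_parse(".env")

    file_paths = ["docs/example.pdf"]  # hypothetical input document
    chunks = load_and_split_documents(file_paths, chunk_size=100, chunk_overlap=20)

    file_node = create_file_node(file_paths[0])
    for idx, chunk in enumerate(chunks):
        chunk_node = create_chunk_node(chunk, idx, file_node)
        # Link each chunk back to its source file in the graph.
        rel = create_relationship(chunk_node, file_node, "part_of")
        print(rel)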