File size: 2,322 Bytes
f7ab812
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from datetime import datetime
from tqdm import tqdm
from typing import List
from utils import (
    create_file_node,
    create_chunk_node,
    create_relationship,
    load_and_split_documents,
)
from subgraphs import extract_graph
from langchain.schema import Document
from langchain_community.graphs.graph_document import GraphDocument
from utils import map_to_base_node
from langchain_community.graphs import Neo4jGraph


def build_graph(file_paths: List[str], graph: Neo4jGraph) -> GraphDocument:
    """
    Main function to build a graph from a list of file paths.

    Args:
    file_paths: List of file paths to be processed

    Returns:
    GraphDocument: Graph document containing the extracted graph

    """
    start_time = datetime.now()
    file_nodes = []
    chunk_nodes = []
    relationships = []
    distinct_nodes = []
    for file in file_paths:
        file_node = create_file_node(file)
        file_nodes.append(file_node)

        # Load and split documents
        documents = load_and_split_documents([file])

        for idx, doc in tqdm(enumerate(documents), total=len(documents)):
            chunk_node = create_chunk_node(doc, idx, file_node)
            chunk_nodes.append(chunk_node)
            relationships.append(create_relationship(chunk_node, file_node, "From"))

            graph_document = extract_graph(doc)
            # Get distinct nodes
            for node in graph_document.nodes:
                if node.id not in distinct_nodes:
                    distinct_nodes.append(node)
                relationships.append(create_relationship(node, chunk_node, "From"))
            # Get all relations
            for relation in graph_document.relationships:
                relationships.append(relation)

    print(f"Time taken: {datetime.now() - start_time}")

    final_graph_document = GraphDocument(
        nodes=distinct_nodes
        + [map_to_base_node(node) for node in [*file_nodes, *chunk_nodes]],
        relationships=relationships,
        source=Document(
            page_content="Combined source of all files and chunks",
            metadata={"description": "Generated from multiple files and their chunks"},
        ),
    )

    # Add the graph document to the graph
    # graph.add_graph_document(final_graph_document)
    return final_graph_document