|
from datetime import datetime |
|
from tqdm import tqdm |
|
from typing import List |
|
from utils import ( |
|
create_file_node, |
|
create_chunk_node, |
|
create_relationship, |
|
load_and_split_documents, |
|
) |
|
from subgraphs import extract_graph |
|
from langchain.schema import Document |
|
from langchain_community.graphs.graph_document import GraphDocument |
|
from utils import map_to_base_node |
|
from langchain_community.graphs import Neo4jGraph |
|
|
|
|
|
def build_graph(file_paths: List[str], graph: Neo4jGraph) -> GraphDocument:
    """
    Build one combined graph document from a list of file paths.

    For every file a file node is created and the file is split into
    documents with ``load_and_split_documents``. Each resulting document
    becomes a chunk node linked to its file node by a "From" relationship.
    ``extract_graph`` is then run on the chunk: every extracted node is
    linked to the chunk by a "From" relationship, and the extracted
    relationships are carried over unchanged.

    Extracted nodes are de-duplicated by ``node.id``: the first node seen
    with a given id is kept, later ones are only linked to their chunk.

    Args:
        file_paths: List of file paths to be processed.
        graph: Neo4j graph handle. It is not used by this function; it is
            kept in the signature so existing callers keep working.

    Returns:
        GraphDocument: Graph document containing the distinct extracted
        nodes, the file and chunk nodes (passed through
        ``map_to_base_node``), and all collected relationships.
    """
    start_time = datetime.now()

    file_nodes = []
    chunk_nodes = []
    relationships = []
    distinct_nodes = []
    # Ids of the nodes already stored in ``distinct_nodes``. The membership
    # test must be made against ids, not against the node objects
    # themselves, otherwise it never matches and duplicates slip through.
    seen_node_ids = set()

    for file in file_paths:
        file_node = create_file_node(file)
        file_nodes.append(file_node)

        documents = load_and_split_documents([file])

        for idx, doc in tqdm(enumerate(documents), total=len(documents)):
            chunk_node = create_chunk_node(doc, idx, file_node)
            chunk_nodes.append(chunk_node)
            relationships.append(create_relationship(chunk_node, file_node, "From"))

            graph_document = extract_graph(doc)

            for node in graph_document.nodes:
                if node.id not in seen_node_ids:
                    seen_node_ids.add(node.id)
                    distinct_nodes.append(node)
                # Every occurrence is linked to its chunk, even when the
                # node itself was already recorded from an earlier chunk.
                relationships.append(create_relationship(node, chunk_node, "From"))

            relationships.extend(graph_document.relationships)

    # Total elapsed time for processing all files.
    print(f"Time taken: {datetime.now() - start_time}")

    final_graph_document = GraphDocument(
        nodes=distinct_nodes
        + [map_to_base_node(node) for node in [*file_nodes, *chunk_nodes]],
        relationships=relationships,
        source=Document(
            page_content="Combined source of all files and chunks",
            metadata={"description": "Generated from multiple files and their chunks"},
        ),
    )

    return final_graph_document
|
|