Commit 0393e86 · 1 parent: 3de8751

use logging
kg_builder/src/graph_creation.py
CHANGED
@@ -1,9 +1,15 @@
+import logging
+
 from langchain_community.document_loaders import WikipediaLoader
 from langchain.text_splitter import TokenTextSplitter
 from knowledge_graph_builder import extract_and_store_graph
 from dotenv import load_dotenv
 from tqdm import tqdm
 
+logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
 # Load environment variables
 load_dotenv()
 
@@ -32,25 +38,25 @@ def build_graph_for_article(query, data_source_name):
     chunk_size=400
     chunk_overlap=10
 
-
+    logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
     raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
     if not raw_documents:
-
+        logger.error(f"Failed to load content for query: {query}")
         return
 
-
+    logger.info(f"{str(len(raw_documents))} document(s) loaded from Wikipedia.")
     for doc in raw_documents:
-
+        logger.info(f"Document: {doc.metadata['source']}")
         #print(f"Document: {doc.page_content}")
 
-
+    logger.info(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) # Only process the first 5 documents
-
+    logger.info(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")
 
-
+    logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...")
     for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
-
+        logger.info(f"Extract data from chunk {str(i)} ...")
         #print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
         extract_and_store_graph(chunkDoc, data_source_name)
 
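For reference, the pattern this commit introduces is a module-level logging setup (logging.basicConfig with a "name - level - message" record format) plus a named module logger used for progress and error messages inside build_graph_for_article. The sketch below reproduces that pattern in a minimal, self-contained form; the load and split steps are hypothetical stubs standing in for WikipediaLoader, TokenTextSplitter, and extract_and_store_graph, not code from this repository.

import logging

# Same root-logger configuration as the commit: INFO level,
# "name - level - message" record format.
logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

def build_graph_for_article(query, data_source_name):
    logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
    raw_documents = [f"Stub article text for '{query}'."]   # hypothetical stand-in for WikipediaLoader
    if not raw_documents:
        logger.error(f"Failed to load content for query: {query}")
        return
    logger.info(f"{len(raw_documents)} document(s) loaded from Wikipedia.")

    # Hypothetical stand-in for TokenTextSplitter: fixed-width character chunks.
    chunks = [text[i:i + 20] for text in raw_documents
              for i in range(0, len(text), 20)]
    logger.info(f"{len(raw_documents)} document(s) split into {len(chunks)} chunk(s)")

    for i, chunk in enumerate(chunks):
        logger.info(f"Extract data from chunk {i} ...")
        # extract_and_store_graph(chunk, data_source_name) would be called here.

build_graph_for_article("Knowledge graph", "wikipedia")

One design note: because basicConfig already sets the root logger to INFO, the extra logger.setLevel(logging.INFO) in the commit is effectively redundant (an unset module logger inherits the root's level), though it is harmless.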