michaelwechner committed on
Commit 0393e86 · Parent: 3de8751

use logging

Files changed (1):
1. kg_builder/src/graph_creation.py (+14 -8)
kg_builder/src/graph_creation.py CHANGED
@@ -1,9 +1,15 @@
+import logging
+
 from langchain_community.document_loaders import WikipediaLoader
 from langchain.text_splitter import TokenTextSplitter
 from knowledge_graph_builder import extract_and_store_graph
 from dotenv import load_dotenv
 from tqdm import tqdm
 
+logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
 # Load environment variables
 load_dotenv()
 
@@ -32,25 +38,25 @@ def build_graph_for_article(query, data_source_name):
     chunk_size=400
     chunk_overlap=10
 
-    print(f"Loading document(s) from Wikipedia using query '{query}' ...")
+    logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
     raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()
     if not raw_documents:
-        print(f"Failed to load content for query: {query}")
+        logger.error(f"Failed to load content for query: {query}")
         return
 
-    print(f"{str(len(raw_documents))} document(s) loaded from Wikipedia.")
+    logger.info(f"{str(len(raw_documents))} document(s) loaded from Wikipedia.")
     for doc in raw_documents:
-        print(f"Document: {doc.metadata['source']}")
+        logger.info(f"Document: {doc.metadata['source']}")
         #print(f"Document: {doc.page_content}")
 
-    print(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
+    logger.info(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
     text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) # Only process the first 5 documents
-    print(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")
+    logger.info(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")
 
-    print(f"Building the knowledge graph for document(s) found by query '{query}' ...")
+    logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...")
     for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
-        print(f"Extract data from chunk {str(i)} ...")
+        logger.info(f"Extract data from chunk {str(i)} ...")
         #print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
         extract_and_store_graph(chunkDoc, data_source_name)
 
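
For context, a minimal self-contained sketch of how the logging setup introduced in this commit behaves on its own. The configuration lines mirror the patch; the example message and the expected output line are illustrative only (the output assumes the script is run directly, so __name__ is __main__):

import logging

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
# basicConfig(level=logging.INFO) already makes INFO the effective level for this
# logger (it inherits from the root logger when its own level is unset), so the
# patch's extra logger.setLevel(logging.INFO) is harmless but redundant.

logger.info("Loading document(s) from Wikipedia using query 'Basel' ...")
# Prints something like:
# __main__ - INFO - Loading document(s) from Wikipedia using query 'Basel' ...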
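
One caveat the patch does not address: emitting a log line for every chunk inside the tqdm loop will interleave log output with the progress bar. Recent tqdm releases ship a helper for exactly this, tqdm.contrib.logging.logging_redirect_tqdm; a sketch, where the loop body stands in for the chunk processing in the patch:

import logging
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Route log records through tqdm.write() while the bar is active,
# so messages appear above the bar instead of breaking it.
with logging_redirect_tqdm():
    for i in tqdm(range(3)):
        logger.info(f"Extract data from chunk {i} ...")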