import logging

from langchain_community.document_loaders import WikipediaLoader, UnstructuredHTMLLoader
from langchain.text_splitter import TokenTextSplitter
from knowledge_graph_builder import extract_and_store_graph
from dotenv import load_dotenv
from tqdm import tqdm

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load environment variables, e.g. OPENAI_API_KEY and the graph database credentials used by api_connections.py
load_dotenv()

# IMPORTANT: Make sure the data source names match the values inside api_connections.py.
# Define the articles / topics to load.
# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
articles = {
    "Traffic Law": "Traffic laws in the United States"
}

# Disabled configurations, kept for reference:
#articles = {
#    "Chemotherapy": "Chemotherapy",
#    "Traffic Law": "Traffic laws in the United States"
#}
#articles = {
#    "SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
#}
#articles = {
#    "SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
#}
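
# NOTE: UnstructuredHTMLLoader reads local HTML files. If the "SquirroDocs" source
# is configured as a URL, download the page to a local file first, e.g. with a
# one-liner like the following (the target filename here is just illustrative):
#
#   import urllib.request
#   urllib.request.urlretrieve("https://docs.squirro.com/en/latest/technical/getting-started.html", "getting-started.html")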

def build_graph_for_article(query, data_source_name):
    """
    Build a knowledge graph from the loaded articles / documents for a particular topic.
    :param query: The query string to search on Wikipedia, e.g. "Traffic laws in the United States",
                  or the path of a local HTML file for HTML-based data sources
    :param data_source_name: Data source name, e.g. "Traffic Law"
    :return: None
    """
    load_max_documents = 5
    # Alternative parameters for models with larger context windows:
    #chunk_size = 4096
    #chunk_overlap = 96
    chunk_size = 400
    chunk_overlap = 10
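    # NOTE: Smaller chunks mean more (but more focused) LLM extraction calls per
    # document; larger chunks reduce the number of calls, but each chunk must fit
    # the model's context window. Tune chunk_size / chunk_overlap accordingly.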

    if data_source_name == "SquirroDocs":
        # NOTE: UnstructuredHTMLLoader expects a local file path (see above).
        logger.info(f"Loading document(s) from HTML file {query} ...")
        loader = UnstructuredHTMLLoader(query)
        raw_documents = loader.load()
    else:
        logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
        raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()

    if not raw_documents:
        logger.error(f"Failed to load content for Data Source '{data_source_name}'!")
        return

    logger.info(f"{str(len(raw_documents))} document(s) loaded.")
    for doc in raw_documents:
        logger.info(f"Document: {doc.metadata['source']}")
        #print(f"Document: {doc.page_content}")

    logger.info(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents])  # Only process the first 5 documents
    logger.info(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")

    logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...")
    for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
        logger.info(f"Extract data from chunk {str(i)} ...")
        #print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
        extract_and_store_graph(chunkDoc, data_source_name)

def main():
    for data_source_name, query in articles.items():
        build_graph_for_article(query, data_source_name)

if __name__ == "__main__":
    main()
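
# For reference: a minimal sketch of what extract_and_store_graph() might look like,
# assuming a Neo4j backend and LangChain's experimental LLMGraphTransformer. The
# actual implementation lives in knowledge_graph_builder.py; the libraries, model
# name, and helper name below are assumptions, not the project's real code.
#
# from langchain_community.graphs import Neo4jGraph
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from langchain_openai import ChatOpenAI
#
# def extract_and_store_graph_sketch(document, data_source_name):
#     # Let an LLM extract nodes and relationships from the chunk ...
#     llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
#     transformer = LLMGraphTransformer(llm=llm)
#     graph_documents = transformer.convert_to_graph_documents([document])
#     # ... and persist them; Neo4jGraph() reads NEO4J_URI / NEO4J_USERNAME /
#     # NEO4J_PASSWORD from the environment. The real function presumably also
#     # tags the stored nodes with data_source_name.
#     graph = Neo4jGraph()
#     graph.add_graph_documents(graph_documents)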

# import os
# from openai import OpenAI
# from api_connections import get_graph_connection
# from knowledge_graph_builder import extract_and_store_graph
# from query_graph import query_knowledge_graph
# from langchain_community.document_loaders import WikipediaLoader
# from langchain.text_splitter import TokenTextSplitter
# from tqdm import tqdm

# def get_llm():
#     api_key = os.getenv("OPENAI_API_KEY")
#     if not api_key:
#         raise ValueError("No OpenAI API key found in environment variables.")
#     return OpenAI(api_key=api_key)

# def classify_query(query):
#     llm = get_llm()
#     response = llm.completions.create(
#         model="gpt-3.5-turbo-instruct",  # text-davinci-003 has been retired; use a current completions model
#         prompt=f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}",
#         max_tokens=60
#     )
#     return response.choices[0].text.strip()

# def main():
#     print("Starting the script...")
#     # Take Wikipedia article name as input
#     article_name = input("Enter the Wikipedia article name: ")  

#     print(f"Loading documents for: {article_name}")
#     # Load and process the Wikipedia article
#     raw_documents = WikipediaLoader(query=article_name).load()
#     text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
#     documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents

#     print("Building the knowledge graph...")
#     # Build the knowledge graph from the documents
#     for i, d in tqdm(enumerate(documents), total=len(documents)):
#         extract_and_store_graph(d)

#     print("Graph construction complete. Please enter your query.")
#     # Take a query related to the graph
#     user_query = input("Enter your query related to the graph: ")

#     print(f"Querying the graph with: {user_query}")
#     # Query the graph and print the answer
#     answer = query_knowledge_graph(user_query)
#     print("Answer to your query:", answer)

# if __name__ == "__main__":
#     main()