import logging
from langchain_community.document_loaders import WikipediaLoader, UnstructuredHTMLLoader
from langchain.text_splitter import TokenTextSplitter
from knowledge_graph_builder import extract_and_store_graph
from dotenv import load_dotenv
from tqdm import tqdm

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load environment variables from the local .env file
load_dotenv()

# IMPORTANT: Make sure data source names match the values inside api_connections.py

# Define articles / topics to load. Only the dict assigned to `articles` below is
# active; the commented-out alternatives are kept for reference.
# Switzerland: https://www.fedlex.admin.ch/eli/cc/1962/1364_1409_1420/de
# Connecticut: https://en.wikipedia.org/wiki/Transportation_in_Connecticut#Rules_of_the_road
articles = {
    "Traffic Law": "Traffic laws in the United States"
}

# articles = {
#     "Chemotherapy": "Chemotherapy",
#     "Traffic Law": "Traffic laws in the United States"
# }

# articles = {
#     "SquirroDocs": "https://docs.squirro.com/en/latest/technical/getting-started.html"
# }

# articles = {
#     "SquirroDocs": "/Users/michaelwechner/Desktop/docs.squirro.com_en_latest_technical_getting-started.html"
# }


def build_graph_for_article(query, data_source_name):
    """
    Build a knowledge graph from the loaded articles / documents of a particular topic.

    :param query: The query string to search on Wikipedia, e.g. "Traffic laws in the United States",
                  or the path / URL of an HTML document for the "SquirroDocs" source
    :param data_source_name: Data source name, e.g. "Traffic Law"
    :return: None
    """
    load_max_documents = 5
    #chunk_size = 4096
    #chunk_overlap = 96
    chunk_size = 400
    chunk_overlap = 10
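    # Note: TokenTextSplitter measures chunk_size / chunk_overlap in tokens (via
    # tiktoken), not characters. The active 400/10 setting keeps each LLM extraction
    # call small and cheap; the commented-out 4096/96 gives the model more context
    # per call at a higher cost.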

    if data_source_name == "SquirroDocs":
        logger.info(f"Loading document(s) from HTML source {query} ...")
        loader = UnstructuredHTMLLoader(query)
        raw_documents = loader.load()
    else:
        logger.info(f"Loading document(s) from Wikipedia using query '{query}' ...")
        raw_documents = WikipediaLoader(query=query, load_max_docs=load_max_documents).load()

    if not raw_documents:
        logger.error(f"Failed to load content for data source '{data_source_name}'!")
        return
logger.info(f"{str(len(raw_documents))} document(s) loaded.")
for doc in raw_documents:
logger.info(f"Document: {doc.metadata['source']}")
#print(f"Document: {doc.page_content}")
logger.info(f"Split document(s) into chunk(s) (Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}) ...")
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunkDocs = text_splitter.split_documents(raw_documents[:load_max_documents]) # Only process the first 5 documents
logger.info(f"{str(len(raw_documents))} document(s) split into {str(len(chunkDocs))} chunk(s)")
logger.info(f"Building the knowledge graph for document(s) found by query '{query}' ...")
for i, chunkDoc in tqdm(enumerate(chunkDocs), total=len(chunkDocs)):
logger.info(f"Extract data from chunk {str(i)} ...")
#print(f"Extract data from chunk {str(i)}: {chunkDoc.page_content}")
        extract_and_store_graph(chunk_doc, data_source_name)


def main():
    for data_source_name, query in articles.items():
        build_graph_for_article(query, data_source_name)


if __name__ == "__main__":
    main()
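
# Running this module directly builds the graph for every configured data source.
# It assumes the credentials needed by knowledge_graph_builder / api_connections
# (e.g. an OpenAI API key and the graph database connection) are supplied via .env.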

# Legacy single-article version, kept below for reference:
#
# import os
# from openai import OpenAI
# from api_connections import get_graph_connection
# from knowledge_graph_builder import extract_and_store_graph
# from query_graph import query_knowledge_graph
# from langchain_community.document_loaders import WikipediaLoader
# from langchain.text_splitter import TokenTextSplitter
# from tqdm import tqdm
#
# def get_llm():
#     api_key = os.getenv("OPENAI_API_KEY")
#     if not api_key:
#         raise ValueError("No OpenAI API key found in environment variables.")
#     return OpenAI(api_key=api_key)
#
# def classify_query(query):
#     llm = get_llm()
#     # The original call used llm.Completion.create with text-davinci-003, which does
#     # not exist on the v1 OpenAI client; completions.create with the replacement
#     # model gpt-3.5-turbo-instruct is the closest current equivalent.
#     response = llm.completions.create(
#         model="gpt-3.5-turbo-instruct",
#         prompt=f"Classify the following query into 'Chemotherapy' or 'Traffic Law': {query}",
#         max_tokens=60
#     )
#     return response.choices[0].text.strip()
#
# def main():
#     print("Starting the script...")
#
#     # Take Wikipedia article name as input
#     article_name = input("Enter the Wikipedia article name: ")
#     print(f"Loading documents for: {article_name}")
#
#     # Load and process the Wikipedia article
#     raw_documents = WikipediaLoader(query=article_name).load()
#     text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=96)
#     documents = text_splitter.split_documents(raw_documents[:5])  # Only process the first 5 documents
#
#     print("Building the knowledge graph...")
#     # Build the knowledge graph from the documents
#     for i, d in tqdm(enumerate(documents), total=len(documents)):
#         extract_and_store_graph(d)
#
#     print("Graph construction complete. Please enter your query.")
#     # Take a query related to the graph
#     user_query = input("Enter your query related to the graph: ")
#     print(f"Querying the graph with: {user_query}")
#
#     # Query the graph and print the answer
#     answer = query_knowledge_graph(user_query)
#     print("Answer to your query:", answer)
#
# if __name__ == "__main__":
#     main()