Commit 6c94128 · Parent(s): 1a2a324
Maurizio Dipierro committed

working cmd
Files changed:
- document_handler.py +22 -0
- main.py +38 -0
- query_executor.py +43 -0
- vectorstore_handler.py +27 -0
document_handler.py
ADDED
@@ -0,0 +1,22 @@
+import os
+import pickle
+from langchain_community.document_loaders.sitemap import SitemapLoader
+
+docs_file_path = 'sitemap_docs.pkl'
+
+def save_documents_to_disk(docs, file_path):
+    """Save the documents to a file using pickle."""
+    with open(file_path, 'wb') as file:
+        pickle.dump(docs, file)
+
+def load_documents_from_disk(file_path):
+    """Load the documents from a file if it exists."""
+    if os.path.exists(file_path):
+        with open(file_path, 'rb') as file:
+            return pickle.load(file)
+    return None
+
+def load_documents_from_sitemap(sitemap_url):
+    """Load documents from a sitemap URL using SitemapLoader."""
+    sitemap_loader = SitemapLoader(web_path=sitemap_url)
+    return sitemap_loader.load()
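For reference (not part of the commit): the three helpers above compose into a simple cache-or-fetch pattern. A minimal sketch, where get_documents is a hypothetical wrapper, assuming the same pickle path:

    import os
    import pickle

    from langchain_community.document_loaders.sitemap import SitemapLoader

    def get_documents(sitemap_url, file_path='sitemap_docs.pkl'):
        """Hypothetical wrapper: return cached docs if present, else fetch and cache."""
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                return pickle.load(f)  # cache hit: no network access needed
        docs = SitemapLoader(web_path=sitemap_url).load()  # crawl every URL in the sitemap
        with open(file_path, 'wb') as f:
            pickle.dump(docs, f)  # cache the result for the next run
        return docs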
main.py
ADDED
@@ -0,0 +1,38 @@
+import argparse
+from document_handler import load_documents_from_disk, load_documents_from_sitemap, save_documents_to_disk
+from vectorstore_handler import load_or_create_vectorstore, get_embeddings
+from query_executor import execute_query
+
+# Argument parsing for command line
+def main():
+    parser = argparse.ArgumentParser(description="Process a query to the Chroma vectorstore.")
+    parser.add_argument('query', type=str, help='The query to search in the vector store')
+    args = parser.parse_args()
+
+    # Path to save the documents
+    sitemap_url = "https://www.originws.it/page-sitemap.xml"
+    docs_file_path = 'sitemap_docs.pkl'
+
+    # Try to load documents from disk
+    docs = load_documents_from_disk(docs_file_path)
+
+    if docs is None:
+        print("Documents not found on disk, loading from sitemap...")
+        # Load documents using SitemapLoader
+        docs = load_documents_from_sitemap(sitemap_url)
+        save_documents_to_disk(docs, docs_file_path)
+        print("Documents saved to disk.")
+    else:
+        print("Documents loaded from disk.")
+
+    # Get embeddings and load/create the vectorstore
+    embeddings = get_embeddings()
+    vectorstore = load_or_create_vectorstore(docs, embeddings)
+
+    # Now that the vectorstore is ready, let's query it
+    question = args.query
+    response = execute_query(question, vectorstore)
+    print(response)
+
+if __name__ == "__main__":
+    main()
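The new entry point is invoked as python main.py "your question". To exercise it without a shell (a sketch, not part of the commit; assumes OPENAI_API_KEY is set and the question text is illustrative):

    import sys

    from main import main

    # argparse reads sys.argv, so inject the positional query before calling main().
    sys.argv = ["main.py", "What services are described on originws.it?"]
    main()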
query_executor.py
ADDED
@@ -0,0 +1,43 @@
+from langchain_openai import ChatOpenAI
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+
+RAG_TEMPLATE = """
+You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
+
+<context>
+{context}
+</context>
+
+Answer the following question:
+
+{question}"""
+
+def format_docs(docs):
+    """Format documents into a single string."""
+    return "\n\n".join(doc.page_content for doc in docs)
+
+def execute_query(question, vectorstore):
+    """Run the query against the vectorstore and return a response."""
+    print(f"Searching for: {question}")
+    docs = vectorstore.similarity_search(question, k=10)
+    print(f"Found {len(docs)} relevant documents for the query.")
+
+    # Set up the LLM and prompt handling
+    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+
+    # Define the RAG prompt template
+    rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)
+
+    # Create the chain
+    chain = (
+        RunnablePassthrough.assign(context=lambda input: format_docs(input["context"]))
+        | rag_prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    # Run the chain with the query
+    response = chain.invoke({"context": docs, "question": question})
+    return response
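The chain relies on RunnablePassthrough.assign to rewrite the "context" key in place (list of documents -> formatted string) while "question" passes through untouched. A standalone sketch of that behavior on toy data (not part of the commit):

    from langchain_core.runnables import RunnablePassthrough

    # assign() merges the computed key back into the input dict, overwriting "context".
    step = RunnablePassthrough.assign(context=lambda x: " | ".join(x["context"]))
    print(step.invoke({"context": ["chunk one", "chunk two"], "question": "q?"}))
    # {'context': 'chunk one | chunk two', 'question': 'q?'}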
vectorstore_handler.py
ADDED
@@ -0,0 +1,27 @@
+import os
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings
+from langchain_chroma import Chroma
+
+chroma_db_dir = 'chroma_vectorstore'
+
+def get_embeddings():
+    """Initialize and return OpenAI embeddings."""
+    return OpenAIEmbeddings(model="text-embedding-3-large")
+
+def load_or_create_vectorstore(docs, embeddings):
+    """Load or create a Chroma vectorstore."""
+    if os.path.exists(chroma_db_dir):
+        print("Loading existing Chroma vector store from disk...")
+        return Chroma(persist_directory=chroma_db_dir, embedding_function=embeddings)
+
+    # Split documents if vectorstore doesn't exist
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
+    all_splits = text_splitter.split_documents(docs)
+    print(f"Documents are split into {len(all_splits)} chunks from {len(docs)} documents.")
+
+    # Create new vectorstore
+    print("Creating new Chroma vector store...")
+    vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory=chroma_db_dir)
+    print(f"Vectorstore created and saved to {chroma_db_dir}")
+    return vectorstore
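Once chroma_vectorstore exists on disk, the persisted store can be reopened and queried on its own. A minimal sketch (not part of the commit; assumes OPENAI_API_KEY is set, and the query string is illustrative):

    from langchain_chroma import Chroma
    from langchain_openai import OpenAIEmbeddings

    # Reopen the store persisted by load_or_create_vectorstore; the embedding
    # model must match the one used at build time (text-embedding-3-large).
    store = Chroma(persist_directory='chroma_vectorstore',
                   embedding_function=OpenAIEmbeddings(model="text-embedding-3-large"))
    for doc in store.similarity_search("opening hours", k=3):
        print(doc.metadata.get("source"), "->", doc.page_content[:80])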