reportwiz / pdf_retriever.py
carvalhomb's picture
Deploying v1.0-rc1
01a8f5e
import os
import dotenv
import pathlib
from langchain_openai import AzureOpenAIEmbeddings
import pymupdf4llm
from qdrant_client import QdrantClient
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import MarkdownTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain.agents import tool
dotenv.load_dotenv()
qdrant_api_key = os.environ["QDRANT_API_KEY"]
# ---- GLOBAL DECLARATIONS ---- #
PDF_FOLDER_PATH = "data/reports/"
VECTORSTORE_LOCATION = os.environ["QDRANT_VECTORSTORE_LOCATION"]
VECTORSTORE_COLLECTION_NAME = os.environ['LANGCHAIN_PROJECT']
# -- RETRIEVAL -- #
# LOAD OpenAI EMBEDDINGS API object
embedding_model = AzureOpenAIEmbeddings(
azure_deployment=os.environ['AZURE_OPENAI_EMB_DEPLOYMENT'],
openai_api_version="2023-05-15",
)
docs_path = pathlib.Path(PDF_FOLDER_PATH)
qdrant_vectorstore = None
qdrant_client = QdrantClient(url=VECTORSTORE_LOCATION, api_key=qdrant_api_key)
collection_exists = qdrant_client.collection_exists(collection_name=VECTORSTORE_COLLECTION_NAME)
if not collection_exists:
print(f"Indexing Files into vectorstore {VECTORSTORE_COLLECTION_NAME}")
# Load docs
# convert the source PDF document to markdown, save it locally
source_documents = []
for file in docs_path.glob("*.pdf"):
md_text = pymupdf4llm.to_markdown(file)
md_path = file.with_suffix('.md')
md_path.write_bytes(md_text.encode())
text_loader = TextLoader(md_path)
loaded_doc = text_loader.load()[0]
loaded_doc.metadata['source'] = file.name
source_documents.append(loaded_doc)
# CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
text_splitter = MarkdownTextSplitter( # RecursiveCharacterTextSplitter(
chunk_size=200,
chunk_overlap=20,
)
split_documents = text_splitter.split_documents(source_documents)
# INDEX FILES
qdrant_vectorstore = QdrantVectorStore.from_documents(
split_documents,
embedding = embedding_model,
location=VECTORSTORE_LOCATION,
collection_name=VECTORSTORE_COLLECTION_NAME,
prefer_grpc=True,
api_key=qdrant_api_key,
)
else:
# Load existing collection
qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
embedding=embedding_model,
collection_name=VECTORSTORE_COLLECTION_NAME,
url=VECTORSTORE_LOCATION,
prefer_grpc=True,
api_key=qdrant_api_key,
)
# Create the retriever
# qdrant_retriever = qdrant_vectorstore.as_retriever(
# search_type='similarity_score_threshold',
# search_kwargs={'score_threshold': 0.5, 'k': 3}
# )
# # Create the tool
# pdf_retriever = create_retriever_tool(
# qdrant_retriever,
# "retrieve_pdfs",
# ,
# )
@tool
def pdf_retriever(user_query):
"""
Tool to search and return reports from existing reports database.
These reports are the preferred way of giving the user information about
the weather in Croatia, and how the weather affects solar panel electricity
production and usage.
"""
hits = qdrant_vectorstore.similarity_search_with_score(user_query, k=3, score_threshold=0.5)
#return hits
responses = []
for doc, score in hits:
response = {}
response['source_documents'] = doc.metadata['source']
response['contents'] = doc.page_content
responses.append(response)
return responses