Spaces:
Runtime error
Runtime error
File size: 3,454 Bytes
4eb10d7 169c6ad 4eb10d7 169c6ad 4eb10d7 169c6ad 4eb10d7 169c6ad 4eb10d7 169c6ad 4eb10d7 7caef44 169c6ad 7caef44 169c6ad 7caef44 169c6ad 7caef44 169c6ad 7caef44 169c6ad 7caef44 169c6ad 4eb10d7 169c6ad 4eb10d7 169c6ad 4eb10d7 169c6ad 4eb10d7 169c6ad 7caef44 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import os
import dotenv
import pathlib
from langchain_openai import AzureOpenAIEmbeddings
import pymupdf4llm
from qdrant_client import QdrantClient
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import MarkdownTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain.agents import tool
# Load environment settings via python-dotenv before any of the
# os.environ lookups below run.
dotenv.load_dotenv()

# Every os.environ[...] lookup in this section is mandatory: a missing
# variable raises KeyError while the module is being imported.
qdrant_api_key = os.environ["QDRANT_API_KEY"]

# ---- GLOBAL DECLARATIONS ---- #
# Folder that is scanned for the PDF reports to index.
PDF_FOLDER_PATH = "data/reports/"
# Address of the Qdrant instance (passed to the client as its URL below).
VECTORSTORE_LOCATION = os.environ["QDRANT_VECTORSTORE_LOCATION"]
# The collection name is taken from the LANGCHAIN_PROJECT variable.
VECTORSTORE_COLLECTION_NAME = os.environ["LANGCHAIN_PROJECT"]

# -- RETRIEVAL -- #
# Azure OpenAI embeddings client; the deployment name comes from the
# environment, the API version is pinned here.
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMB_DEPLOYMENT"],
    openai_api_version="2023-05-15",
)

docs_path = pathlib.Path(PDF_FOLDER_PATH)

# Placeholder for the module-level vector store handle.
qdrant_vectorstore = None

# Ask the server whether the target collection is already there.
qdrant_client = QdrantClient(
    url=VECTORSTORE_LOCATION,
    api_key=qdrant_api_key,
)
collection_exists = qdrant_client.collection_exists(
    collection_name=VECTORSTORE_COLLECTION_NAME,
)
if not collection_exists:
    # The collection does not exist yet: convert every PDF to markdown,
    # split it into chunks, embed the chunks and upload them to Qdrant.
    print(f"Indexing Files into vectorstore {VECTORSTORE_COLLECTION_NAME}")

    # Load docs
    source_documents = []
    # sorted() gives a stable indexing order; glob order is filesystem-dependent.
    for file in sorted(docs_path.glob("*.pdf")):
        # Convert the source PDF document to markdown and save it next to the PDF.
        md_text = pymupdf4llm.to_markdown(file)
        md_path = file.with_suffix(".md")
        md_path.write_bytes(md_text.encode("utf-8"))

        # Read the file back with the same encoding it was written with.
        # Without an explicit encoding the loader falls back to the platform
        # default, which can fail or garble text on non-UTF-8 locales.
        loaded_doc = TextLoader(md_path, encoding="utf-8").load()[0]
        # Report the original PDF name as the source, not the generated .md.
        loaded_doc.metadata["source"] = file.name
        source_documents.append(loaded_doc)

    if not source_documents:
        # Fail fast: there is nothing to index, and if the collection were
        # created anyway, later runs would take the `else` branch below and
        # never index anything.
        raise FileNotFoundError(
            f"No PDF files found in {docs_path}; nothing to index."
        )

    # CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
    text_splitter = MarkdownTextSplitter(
        chunk_size=200,
        chunk_overlap=20,
    )
    split_documents = text_splitter.split_documents(source_documents)

    # INDEX FILES
    qdrant_vectorstore = QdrantVectorStore.from_documents(
        split_documents,
        embedding=embedding_model,
        location=VECTORSTORE_LOCATION,
        collection_name=VECTORSTORE_COLLECTION_NAME,
        prefer_grpc=True,
        api_key=qdrant_api_key,
    )
else:
    # Load existing collection
    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=VECTORSTORE_COLLECTION_NAME,
        url=VECTORSTORE_LOCATION,
        prefer_grpc=True,
        api_key=qdrant_api_key,
    )
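# NOTE: disabled alternative, kept for reference only. It wrapped a retriever
# with `create_retriever_tool`; the `@tool`-decorated `pdf_retriever` function
# defined below is what this module actually exposes. The tool description
# argument (the line holding only a comma) was never filled in.
# Create the retriever
# qdrant_retriever = qdrant_vectorstore.as_retriever(
# search_type='similarity_score_threshold',
# search_kwargs={'score_threshold': 0.5, 'k': 3}
# )
# # Create the tool
# pdf_retriever = create_retriever_tool(
# qdrant_retriever,
# "retrieve_pdfs",
# ,
# )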
# Agent tool: look up the indexed report chunks that best match a query.
#
# Parameters:
#   user_query: free-text query to search the vector store with.
# Returns:
#   A list with one dict per hit (at most 3 are requested, with a score
#   threshold of 0.5), each with the keys:
#     'source_documents' - the 'source' metadata value of the matching chunk
#     'contents'         - the text of the matching chunk
# Raises:
#   KeyError if a hit's metadata has no 'source' entry.
#
# NOTE: the docstring wording is kept unchanged on purpose, because the
# `@tool` decorator uses it as the tool description shown to the agent. The
# `str` annotation lets the decorator infer a typed argument schema.
@tool
def pdf_retriever(user_query: str) -> list:
    """
    Tool to search and return reports from existing reports database.
    These reports are the preferred way of giving the user information about
    the weather in Croatia, and how the weather affects solar panel electricity
    production and usage.
    """
    hits = qdrant_vectorstore.similarity_search_with_score(
        user_query,
        k=3,
        score_threshold=0.5,
    )
    # The similarity score is only used for filtering (via score_threshold);
    # it is not part of the returned payload.
    return [
        {
            'source_documents': doc.metadata['source'],
            'contents': doc.page_content,
        }
        for doc, _score in hits
    ]
|