import os
import pathlib

import dotenv
import pymupdf4llm
from langchain.agents import tool
from langchain_community.document_loaders import TextLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import MarkdownTextSplitter
from qdrant_client import QdrantClient

dotenv.load_dotenv()

qdrant_api_key = os.environ["QDRANT_API_KEY"]

# ---- GLOBAL DECLARATIONS ---- #

PDF_FOLDER_PATH = "data/reports/"
VECTORSTORE_LOCATION = os.environ["QDRANT_VECTORSTORE_LOCATION"]
VECTORSTORE_COLLECTION_NAME = os.environ['LANGCHAIN_PROJECT']
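
# NOTE: this script assumes the following variables are set in .env or the
# environment: QDRANT_API_KEY, QDRANT_VECTORSTORE_LOCATION, LANGCHAIN_PROJECT,
# and AZURE_OPENAI_EMB_DEPLOYMENT, plus the standard Azure OpenAI credentials
# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT) read implicitly by
# AzureOpenAIEmbeddings.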

# -- RETRIEVAL -- #

# Load the Azure OpenAI embeddings client
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment=os.environ['AZURE_OPENAI_EMB_DEPLOYMENT'],
    openai_api_version="2023-05-15",
)

docs_path = pathlib.Path(PDF_FOLDER_PATH)


qdrant_vectorstore = None

qdrant_client = QdrantClient(url=VECTORSTORE_LOCATION, api_key=qdrant_api_key)
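
# The raw client is used only to check whether the collection already exists;
# reads and writes below go through the LangChain QdrantVectorStore wrapper.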

collection_exists = qdrant_client.collection_exists(collection_name=VECTORSTORE_COLLECTION_NAME)


if not collection_exists:
    print(f"Indexing Files into vectorstore {VECTORSTORE_COLLECTION_NAME}")

    # Convert each source PDF to Markdown, save the Markdown alongside the
    # PDF, and load it as a LangChain document.
    source_documents = []
    for file in docs_path.glob("*.pdf"):

        md_text = pymupdf4llm.to_markdown(file)

        md_path = file.with_suffix('.md')
        md_path.write_text(md_text, encoding='utf-8')

        text_loader = TextLoader(md_path, encoding='utf-8')
        loaded_doc = text_loader.load()[0]
        loaded_doc.metadata['source'] = file.name
        source_documents.append(loaded_doc)


    # CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
    # MarkdownTextSplitter splits along Markdown structure; sizes are in characters.
    text_splitter = MarkdownTextSplitter(
        chunk_size=200,
        chunk_overlap=20,
    )

    split_documents = text_splitter.split_documents(source_documents)

    # INDEX FILES
    qdrant_vectorstore = QdrantVectorStore.from_documents(
        split_documents,
        embedding=embedding_model,
        location=VECTORSTORE_LOCATION,
        collection_name=VECTORSTORE_COLLECTION_NAME,
        prefer_grpc=True,
        api_key=qdrant_api_key,
    )

else:
    # Load existing collection
    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=VECTORSTORE_COLLECTION_NAME,
        url=VECTORSTORE_LOCATION,
        prefer_grpc=True,
        api_key=qdrant_api_key,
    )
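
# Quick sanity check (a sketch; the query text is illustrative):
# print(qdrant_vectorstore.similarity_search("solar production in Croatia", k=1))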


# Alternative: expose the vectorstore as a retriever and wrap it with
# langchain's create_retriever_tool instead of the custom @tool below.
# qdrant_retriever = qdrant_vectorstore.as_retriever(
#     search_type='similarity_score_threshold',
#     search_kwargs={'score_threshold': 0.5, 'k': 3},
# )
# pdf_retriever = create_retriever_tool(
#     qdrant_retriever,
#     "retrieve_pdfs",
#     ...,
# )

@tool
def pdf_retriever(user_query: str) -> list[dict]:
    """
    Search the existing reports database and return matching report excerpts.
    These reports are the preferred way of giving the user information about
    the weather in Croatia and how the weather affects solar panel electricity
    production and usage.
    """
    hits = qdrant_vectorstore.similarity_search_with_score(
        user_query, k=3, score_threshold=0.5
    )
    return [
        {
            'source_documents': doc.metadata['source'],
            'contents': doc.page_content,
        }
        for doc, _score in hits
    ]
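

# Example invocation (a minimal sketch: assumes the collection is populated;
# the query text is illustrative). A single-argument LangChain tool can be
# called directly with .invoke():
if __name__ == "__main__":
    results = pdf_retriever.invoke("How does cloud cover affect solar output?")
    for r in results:
        print(f"{r['source_documents']}: {r['contents'][:80]}")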