import os
import pathlib

import dotenv
import pymupdf4llm
from langchain.agents import tool
from langchain_community.document_loaders import TextLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import MarkdownTextSplitter
from qdrant_client import QdrantClient

dotenv.load_dotenv()

qdrant_api_key = os.environ["QDRANT_API_KEY"]

# ---- GLOBAL DECLARATIONS ---- #

PDF_FOLDER_PATH = "data/reports/"
VECTORSTORE_LOCATION = os.environ["QDRANT_VECTORSTORE_LOCATION"]
VECTORSTORE_COLLECTION_NAME = os.environ['LANGCHAIN_PROJECT']
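
# NOTE: this script assumes the following variables are set in .env or the
# environment: QDRANT_API_KEY, QDRANT_VECTORSTORE_LOCATION, LANGCHAIN_PROJECT,
# and AZURE_OPENAI_EMB_DEPLOYMENT, plus the standard Azure OpenAI credentials
# (AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT) read implicitly by
# AzureOpenAIEmbeddings.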

# -- RETRIEVAL -- #

# Load the Azure OpenAI embeddings client
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment=os.environ['AZURE_OPENAI_EMB_DEPLOYMENT'],
    openai_api_version="2023-05-15",
)

docs_path = pathlib.Path(PDF_FOLDER_PATH)


qdrant_vectorstore = None

qdrant_client = QdrantClient(url=VECTORSTORE_LOCATION, api_key=qdrant_api_key)
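
# The raw client is used only to check whether the collection already exists;
# reads and writes below go through the LangChain QdrantVectorStore wrapper.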

collection_exists = qdrant_client.collection_exists(collection_name=VECTORSTORE_COLLECTION_NAME)


if not collection_exists:
    print(f"Indexing Files into vectorstore {VECTORSTORE_COLLECTION_NAME}")

    # Convert each source PDF to Markdown, save the Markdown alongside the
    # PDF, and load it as a LangChain document.
    source_documents = []
    for file in docs_path.glob("*.pdf"):

        md_text = pymupdf4llm.to_markdown(file)

        md_path = file.with_suffix('.md')
        md_path.write_text(md_text, encoding='utf-8')

        text_loader = TextLoader(md_path, encoding='utf-8')
        loaded_doc = text_loader.load()[0]
        loaded_doc.metadata['source'] = file.name
        source_documents.append(loaded_doc)


    # CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
    # MarkdownTextSplitter splits along Markdown structure; sizes are in characters.
    text_splitter = MarkdownTextSplitter(
        chunk_size=200,
        chunk_overlap=20,
    )

    split_documents = text_splitter.split_documents(source_documents)

    # INDEX FILES
    qdrant_vectorstore = QdrantVectorStore.from_documents(
        split_documents,
        embedding=embedding_model,
        location=VECTORSTORE_LOCATION,
        collection_name=VECTORSTORE_COLLECTION_NAME,
        prefer_grpc=True,
        api_key=qdrant_api_key,
    )

else:
    # Load existing collection
    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=VECTORSTORE_COLLECTION_NAME,
        url=VECTORSTORE_LOCATION,
        prefer_grpc=True,
        api_key=qdrant_api_key,
    )
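
# Quick sanity check (a sketch; the query text is illustrative):
# print(qdrant_vectorstore.similarity_search("solar production in Croatia", k=1))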


# Alternative: expose the vectorstore as a retriever and wrap it with
# langchain's create_retriever_tool instead of the custom @tool below.
# qdrant_retriever = qdrant_vectorstore.as_retriever(
#     search_type='similarity_score_threshold',
#     search_kwargs={'score_threshold': 0.5, 'k': 3},
# )
# pdf_retriever = create_retriever_tool(
#     qdrant_retriever,
#     "retrieve_pdfs",
#     ...,
# )

@tool
def pdf_retriever(user_query: str) -> list[dict]:
    """
    Search the existing reports database and return matching report excerpts.
    These reports are the preferred way of giving the user information about
    the weather in Croatia and how the weather affects solar panel electricity
    production and usage.
    """
    hits = qdrant_vectorstore.similarity_search_with_score(
        user_query, k=3, score_threshold=0.5
    )
    return [
        {
            'source_documents': doc.metadata['source'],
            'contents': doc.page_content,
        }
        for doc, _score in hits
    ]
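

# Example invocation (a minimal sketch: assumes the collection is populated;
# the query text is illustrative). A single-argument LangChain tool can be
# called directly with .invoke():
if __name__ == "__main__":
    results = pdf_retriever.invoke("How does cloud cover affect solar output?")
    for r in results:
        print(f"{r['source_documents']}: {r['contents'][:80]}")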