File size: 8,640 Bytes
ed91833
 
1ef298a
 
ed91833
999f24c
ed91833
 
 
999f24c
654e910
45884d3
654e910
 
ed91833
1ef298a
 
4d17f84
999f24c
45884d3
 
 
 
 
 
 
ed91833
999f24c
 
ed91833
4d17f84
45884d3
 
 
999f24c
 
 
4d17f84
1ef298a
45884d3
654e910
 
4d17f84
1ef298a
999f24c
654e910
45884d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b22f9a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45884d3
b22f9a0
45884d3
 
 
 
 
 
 
 
 
 
 
 
b22f9a0
45884d3
 
 
 
 
 
 
b22f9a0
7d1e4e7
b22f9a0
 
 
 
 
 
7d1e4e7
 
b22f9a0
 
 
 
 
 
 
45884d3
 
 
654e910
45884d3
654e910
 
999f24c
654e910
b22f9a0
999f24c
654e910
45884d3
 
 
999f24c
895b645
654e910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ef298a
 
 
 
 
999f24c
1ef298a
654e910
 
1ef298a
b22f9a0
1ef298a
654e910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45884d3
654e910
1ef298a
654e910
 
 
 
 
 
 
 
 
 
 
 
 
1ef298a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
"""
Super early version of a vector store. Just want to make something available for the rest of the app to use.

Vector store implementation with singleton pattern to ensure only one instance exists.
"""

import os
import requests
import nltk
import logging
import requests

from typing import Optional, List, Union
from langchain_qdrant import QdrantVectorStore
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from langchain.schema import Document
from .vectorstore_helpers import (
    get_document_hash_as_uuid,
    enrich_document_metadata,
    check_collection_exists,
)

# Fetch the NLTK resources below as an import-time side effect.
# NOTE(review): presumably needed by the HTML parsing behind DirectoryLoader — confirm.
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")

# Embedding model used when no explicit model id is passed to get_embedding_model().
DEFAULT_EMBEDDING_MODEL_ID = "text-embedding-3-small"
# Vector size used when get_vector_db() creates the collection.
# NOTE(review): presumably the output size of DEFAULT_EMBEDDING_MODEL_ID — confirm.
DEFAULT_VECTOR_DIMENSIONS = 1536
# Distance metric used when get_vector_db() creates the collection.
DEFAULT_VECTOR_DISTANCE = Distance.COSINE
# Name of the single collection this module creates, fills and queries.
PROBLEMS_REFERENCE_COLLECTION_NAME = "problems_reference_collection"
# Directory used for Qdrant storage when no remote server is configured.
LOCAL_QDRANT_PATH = "/data/qdrant_db"

logger = logging.getLogger(__name__)

# Module-level singletons, created lazily by the get_* factory functions below.
_qdrant_client_instance: Optional[QdrantClient] = None
_vector_db_instance: Optional[QdrantVectorStore] = None
_embedding_model: Optional[Union[OpenAIEmbeddings, HuggingFaceEmbeddings]] = None
# Model id the cached _embedding_model was built for (None means the default model).
_embedding_model_id: Optional[str] = None


def _initialize_vector_db():
    """
    Seed the problems reference collection with the LangChain RAG tutorial.

    Downloads the tutorial page into ``static/data`` unless a cached copy is
    already on disk, loads every ``*.html`` file found in that directory, tags
    each loaded document with the tutorial metadata, splits the documents into
    overlapping chunks and hands the chunks to ``store_documents``.

    Raises:
        requests.RequestException: If the tutorial page cannot be downloaded
            (connection failure, timeout or a non-success HTTP status).
    """
    data_dir = "static/data"
    tutorial_url = "https://python.langchain.com/docs/tutorials/rag/"
    html_path = "static/data/langchain_rag_tutorial.html"

    os.makedirs(data_dir, exist_ok=True)

    if not os.path.exists(html_path):
        # Bound the request so a stalled connection cannot hang start-up forever.
        response = requests.get(tutorial_url, timeout=30)
        # Fail before writing: an error page saved here would be treated as the
        # cached tutorial on every later run and indexed as if it were real content.
        response.raise_for_status()
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(response.text)

    loader = DirectoryLoader(data_dir, glob="*.html")
    documents = loader.load()

    # Every HTML file in the directory receives the same tutorial metadata.
    enriched_docs = [
        enrich_document_metadata(
            doc,
            title="LangChain RAG Tutorial",
            type="tutorial",
            source_url=tutorial_url,
            description="Official LangChain tutorial on building RAG applications",
            date_added="2024-03-21",
            category="documentation",
            version="1.0",
            language="en",
        )
        for doc in documents
    ]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_chunks = text_splitter.split_documents(enriched_docs)

    store_documents(
        split_chunks,
        PROBLEMS_REFERENCE_COLLECTION_NAME,
    )


def get_qdrant_client():
    """
    Return the module-wide Qdrant client, creating it on first use.

    When both ``QDRANT_URL`` and ``QDRANT_API_KEY`` are set in the environment
    the client connects to that server. Otherwise a warning is logged and the
    client is backed by on-disk storage under ``LOCAL_QDRANT_PATH``.

    Returns:
        The cached ``QdrantClient`` instance.

    Raises:
        Exception: Whatever ``QdrantClient`` raises while connecting to the
            remote server; the failure is logged and re-raised unchanged.
    """
    global _qdrant_client_instance

    if _qdrant_client_instance is not None:
        return _qdrant_client_instance

    # Read the settings once; the values are reused for logging and connecting.
    qdrant_url = os.environ.get("QDRANT_URL")
    qdrant_api_key = os.environ.get("QDRANT_API_KEY")

    if qdrant_url is None or qdrant_api_key is None:
        logger.warning(
            "QDRANT_URL or QDRANT_API_KEY is not set. Defaulting to local file-based vector store."
        )
        os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
        _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH)
        return _qdrant_client_instance

    # The URL is interpolated from a local variable: reusing double quotes inside
    # a double-quoted f-string is a syntax error on Python versions before 3.12.
    logger.info(f"Attempting to connect to Qdrant at {qdrant_url}")
    try:
        _qdrant_client_instance = QdrantClient(
            url=qdrant_url,
            api_key=qdrant_api_key,
        )
        logger.info("Successfully connected to Qdrant Cloud")
    except Exception as e:
        logger.error(f"Failed to connect to Qdrant Cloud: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise
    return _qdrant_client_instance


def get_all_unique_source_of_docs_in_collection(
    collection_name: str = PROBLEMS_REFERENCE_COLLECTION_NAME,
    limit: int = 1000,
    offset: int = 0,
) -> List[str]:
    """
    Collect the distinct ``source`` payload values of the points in a collection.

    The collection is read page by page through the Qdrant scroll API.

    Args:
        collection_name: Collection to scan.
        limit: Maximum number of points requested per page.
        offset: Value passed as the scroll offset of the first page.

    Returns:
        The unique ``source`` values found, in no particular order.

    NOTE(review): only a top-level ``source`` payload field is inspected. If the
    documents are written with their metadata nested under another payload key,
    nothing will match — confirm against the stored payload layout.
    """
    client = get_qdrant_client()
    sources = set()
    next_offset = offset

    while True:
        # scroll() returns (points, next_page_offset). The second element is the
        # ID to resume from, or None after the last page, so it is passed back
        # as-is. Adding `limit` to it would skip points or fail outright for
        # None / non-integer IDs.
        points, next_offset = client.scroll(
            collection_name=collection_name,
            limit=limit,
            offset=next_offset,
            with_payload=["source"],
            with_vectors=False,
        )

        for point in points:
            payload = point.payload or {}
            if "source" in payload:
                sources.add(payload["source"])

        if next_offset is None or not points:
            break

    return list(sources)


# TODO This is a dumb hack to get around Qdrant client restrictions when using local file storage.
# Instead of using the client directly, we use QdrantVectorStore's similarity search
# with a dummy query to get all documents, then extract unique sources.
def get_all_unique_source_of_docs_in_collection_DUMB(
    collection_name: str = PROBLEMS_REFERENCE_COLLECTION_NAME,
) -> List[str]:
    """
    Return the distinct ``title`` metadata values found in the vector store.

    Runs a similarity search with an empty query (capped at 1000 results) on
    the singleton vector store and gathers the ``title`` of every returned
    document that has one. ``collection_name`` is accepted but not used by the
    body; the search always goes through ``get_vector_db()``.
    """
    # An empty query is used as a catch-all to pull back as many documents as allowed.
    matches = get_vector_db().similarity_search("", k=1000)

    unique_titles = {
        match.metadata["title"]
        for match in matches
        if match.metadata and "title" in match.metadata
    }
    return list(unique_titles)


def store_documents(
    documents: List[Document],
    collection_name: str,
    embedding_model_id: str = None,
):
    """
    Add documents to the singleton vector store.

    Each document is stored under the ID that ``get_document_hash_as_uuid``
    returns for it.

    Args:
        documents: Documents to add. An empty list is a no-op.
        collection_name: Currently unused; kept for interface compatibility.
            Documents go into the collection the singleton store is bound to.
        embedding_model_id: Currently unused; kept for interface compatibility.
            The store embeds with the model it was created with.

    Raises:
        AssertionError: If the vector store singleton has not been created yet.
    """
    # Raised explicitly instead of via `assert` so the check still runs under
    # `python -O`; the exception type and message are unchanged.
    if _vector_db_instance is None:
        raise AssertionError("Vector database instance not initialized")

    if not documents:
        return

    # No embedding-model or client lookup here: the store already owns both, and
    # calling get_embedding_model() with a different id would replace the cached
    # model as a side effect without affecting this store.
    _vector_db_instance.add_documents(
        documents=documents,
        ids=[get_document_hash_as_uuid(doc) for doc in documents],
    )


def get_embedding_model(embedding_model_id: str = None):
    """
    Return the cached embedding model, rebuilding it when the requested id changes.

    With no id, an ``OpenAIEmbeddings`` instance for ``DEFAULT_EMBEDDING_MODEL_ID``
    is used; any other id is loaded through ``HuggingFaceEmbeddings``. The model
    is rebuilt whenever the requested id differs from the one the cached model
    was built for, including a switch back to no id.
    """
    global _embedding_model, _embedding_model_id

    cache_is_current = (
        _embedding_model is not None and embedding_model_id == _embedding_model_id
    )
    if cache_is_current:
        return _embedding_model

    if embedding_model_id is not None:
        fresh_model = HuggingFaceEmbeddings(model_name=embedding_model_id)
    else:
        fresh_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)

    # Globals are only updated once construction has succeeded.
    _embedding_model = fresh_model
    _embedding_model_id = embedding_model_id
    return _embedding_model


def get_vector_db(embedding_model_id: str = None) -> QdrantVectorStore:
    """
    Return the singleton vector store, creating and seeding it on first use.

    On the first call the problems reference collection is created if it does
    not exist yet, the store is opened on that collection, and a newly created
    collection is filled by ``_initialize_vector_db``. Later calls return the
    cached store.

    Args:
        embedding_model_id: Embedding model for the store. Only honoured on the
            call that creates the store; ignored once the singleton exists.

    Returns:
        The cached ``QdrantVectorStore``.

    NOTE(review): the collection is always created with
    ``DEFAULT_VECTOR_DIMENSIONS``; a model with a different output size would
    presumably not match it — confirm before passing other model ids.
    """
    global _vector_db_instance, _qdrant_client_instance

    if _vector_db_instance is not None:
        return _vector_db_instance

    embedding_model = get_embedding_model(embedding_model_id)
    client = get_qdrant_client()

    need_to_initialize_db = not check_collection_exists(
        client, PROBLEMS_REFERENCE_COLLECTION_NAME
    )
    if need_to_initialize_db:
        client.create_collection(
            PROBLEMS_REFERENCE_COLLECTION_NAME,
            vectors_config=VectorParams(
                size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE
            ),
        )

    # Open the store on the same backend get_qdrant_client() selected, so the
    # collection checked/created above is the one the store reads and writes.
    qdrant_url = os.environ.get("QDRANT_URL")
    qdrant_api_key = os.environ.get("QDRANT_API_KEY")
    if qdrant_url is None or qdrant_api_key is None:
        os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
        connection_kwargs = {"path": LOCAL_QDRANT_PATH}
    else:
        connection_kwargs = {"url": qdrant_url, "api_key": qdrant_api_key}

    # TODO temp. The client must be closed before the store opens its own one,
    # otherwise local mode fails with "Storage folder ... is already accessed by
    # another instance of Qdrant client". A Qdrant server would avoid this, but
    # it is unclear whether Docker Compose can run in Hugging Face Spaces.
    client.close()
    # Forget the closed client so a failure below does not leave a dead singleton.
    _qdrant_client_instance = None

    _vector_db_instance = QdrantVectorStore.from_existing_collection(
        collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
        embedding=embedding_model,
        **connection_kwargs,
    )
    # Share the store's live client, so get_qdrant_client() neither returns the
    # closed one nor opens a second client on the same local folder.
    _qdrant_client_instance = _vector_db_instance.client

    if need_to_initialize_db:
        _initialize_vector_db()

    return _vector_db_instance