""" Super early version of a vector store. Just want to make something available for the rest of the app to use. Vector store implementation with singleton pattern to ensure only one instance exists. """ import os import requests import nltk import logging import requests from typing import Optional, List, Union from langchain_qdrant import QdrantVectorStore from langchain_openai.embeddings import OpenAIEmbeddings from langchain_community.document_loaders import DirectoryLoader from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_huggingface import HuggingFaceEmbeddings from qdrant_client import QdrantClient from qdrant_client.models import VectorParams, Distance from langchain.schema import Document from .vectorstore_helpers import ( get_document_hash_as_uuid, enrich_document_metadata, check_collection_exists, ) nltk.download("punkt_tab") nltk.download("averaged_perceptron_tagger_eng") DEFAULT_EMBEDDING_MODEL_ID = "text-embedding-3-small" DEFAULT_VECTOR_DIMENSIONS = 1536 DEFAULT_VECTOR_DISTANCE = Distance.COSINE PROBLEMS_REFERENCE_COLLECTION_NAME = "problems_reference_collection" LOCAL_QDRANT_PATH = "/data/qdrant_db" logger = logging.getLogger(__name__) # Global variable to store the singleton instance _qdrant_client_instance: Optional[QdrantClient] = None _vector_db_instance: Optional[QdrantVectorStore] = None _embedding_model: Optional[Union[OpenAIEmbeddings, HuggingFaceEmbeddings]] = None _embedding_model_id: str = None def _initialize_vector_db(): os.makedirs("static/data", exist_ok=True) html_path = "static/data/langchain_rag_tutorial.html" if not os.path.exists(html_path): url = "https://python.langchain.com/docs/tutorials/rag/" response = requests.get(url) with open(html_path, "w", encoding="utf-8") as f: f.write(response.text) loader = DirectoryLoader("static/data", glob="*.html") documents = loader.load() enriched_docs = [ enrich_document_metadata( doc, title="LangChain RAG Tutorial", type="tutorial", source_url="https://python.langchain.com/docs/tutorials/rag/", description="Official LangChain tutorial on building RAG applications", date_added="2024-03-21", category="documentation", version="1.0", language="en", ) for doc in documents ] text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) split_chunks = text_splitter.split_documents(enriched_docs) store_documents( split_chunks, PROBLEMS_REFERENCE_COLLECTION_NAME, ) def get_qdrant_client(): global _qdrant_client_instance if _qdrant_client_instance is None: if ( os.environ.get("QDRANT_URL") is None or os.environ.get("QDRANT_API_KEY") is None ): logger.warning( "QDRANT_URL or QDRANT_API_KEY is not set. Defaulting to local memory vector store." ) os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True) _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH) # _qdrant_client_instance = QdrantClient(":memory:") return _qdrant_client_instance logger.info( f"Attempting to connect to Qdrant at {os.environ.get("QDRANT_URL")}" ) try: _qdrant_client_instance = QdrantClient( url=os.environ.get("QDRANT_URL"), api_key=os.environ.get("QDRANT_API_KEY"), ) logger.info("Successfully connected to Qdrant Cloud") except Exception as e: logger.error(f"Failed to connect to Qdrant Cloud: {str(e)}") raise e return _qdrant_client_instance def get_all_unique_source_of_docs_in_collection( collection_name: str = PROBLEMS_REFERENCE_COLLECTION_NAME, limit: int = 1000, offset: int = 0, ) -> List[Document]: response = get_qdrant_client().scroll( collection_name=collection_name, limit=limit, offset=offset, with_payload=["source"], with_vectors=False, ) result = set() while len(response[0]) > 0: for point in response[0]: if "source" in point.payload: result.add(point.payload["source"]) offset = response[1] response = get_qdrant_client().scroll( collection_name=collection_name, limit=limit, offset=offset + limit, ) return list(result) # TODO This is a dumb hack to get around Qdrant client restrictions when using local file storage. # Instead of using the client directly, we use QdrantVectorStore's similarity search # with a dummy query to get all documents, then extract unique sources. def get_all_unique_source_of_docs_in_collection_DUMB( collection_name: str = PROBLEMS_REFERENCE_COLLECTION_NAME, ) -> List[str]: vector_store = get_vector_db() # Use a very generic query that should match everything docs = vector_store.similarity_search("", k=1000) sources = set() for doc in docs: if doc.metadata and "title" in doc.metadata: sources.add(doc.metadata["title"]) return list(sources) def store_documents( documents: List[Document], collection_name: str, embedding_model_id: str = None, ): global _vector_db_instance assert _vector_db_instance is not None, "Vector database instance not initialized" embedding_model = get_embedding_model(embedding_model_id) client = get_qdrant_client() _vector_db_instance.add_documents( documents=documents, ids=[get_document_hash_as_uuid(doc) for doc in documents], ) def get_embedding_model(embedding_model_id: str = None): """ Factory function that returns a singleton instance of the embedding model. Creates the instance if it doesn't exist. """ global _embedding_model, _embedding_model_id if _embedding_model is None or embedding_model_id != _embedding_model_id: if embedding_model_id is None: _embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID) else: _embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_id) _embedding_model_id = embedding_model_id return _embedding_model def get_vector_db(embedding_model_id: str = None) -> QdrantVectorStore: """ Factory function that returns a singleton instance of the vector database. Creates the instance if it doesn't exist. """ global _vector_db_instance if _vector_db_instance is None: need_to_initialize_db = False embedding_model = get_embedding_model(embedding_model_id) client = get_qdrant_client() if not check_collection_exists(client, PROBLEMS_REFERENCE_COLLECTION_NAME): client.create_collection( PROBLEMS_REFERENCE_COLLECTION_NAME, vectors_config=VectorParams( size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE ), ) need_to_initialize_db = True os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True) # TODO temp. Need to close and reopen client to avoid RuntimeError: Storage folder /data/qdrant_db is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead. # Better solution is to use Qdrant server instead of local file storage, but I'm not sure I can run Docker Compose in Hugging Face Spaces. client.close() _vector_db_instance = QdrantVectorStore.from_existing_collection( # client=client, # TODO temp. If this works, go file bug with langchain-qdrant # location=":memory:", path=LOCAL_QDRANT_PATH, collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME, embedding=embedding_model, ) # TODO super hacky, but maybe I don't need client anymore? I'll just try to use QdrantVectorStore # just really trying not to instantiate a new client to access local path # because as long as QdrantVectorStore is instantiated, it will use the same client it created on the backend client = None if need_to_initialize_db: _initialize_vector_db() # vector_store = QdrantVectorStore( # client=client, # collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME, # embedding=embedding_model, # ) return _vector_db_instance