Spaces:

Rsr2425
/

SimpliFi

Sleeping

App Files Files Community

Rsr2425 committed on Mar 27

Commit

654e910

1 Parent(s): 895b645

Fixed vectorstore code and got it working locally

Browse files

Files changed (7) hide show

Dockerfile +4 -0
Dockerfile.test +4 -0
backend/app/vectorstore.py +79 -44
backend/app/vectorstore_helpers.py +6 -2
backend/tests/test_vectorstore.py +70 -2
pyproject.toml +2 -1
test_vectorstore_code.ipynb +20 -0

Dockerfile CHANGED Viewed

@@ -15,6 +15,10 @@ WORKDIR /app
 RUN mkdir -p /app/static/data
 # Create a non-root user
 RUN useradd -m -u 1000 user
 RUN chown -R user:user /app

 RUN mkdir -p /app/static/data
+# # Add DNS configuration
+# RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && \
+#     echo "nameserver 8.8.4.4" >> /etc/resolv.conf
 # Create a non-root user
 RUN useradd -m -u 1000 user
 RUN chown -R user:user /app

Dockerfile.test CHANGED Viewed

@@ -11,6 +11,10 @@ RUN npm run build
 # Use Python image with uv pre-installed
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
 # Set up Node.js and npm
 RUN apt-get update && apt-get install -y \
     curl \

 # Use Python image with uv pre-installed
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+# Add DNS configuration
+# RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && \
+#     echo "nameserver 8.8.4.4" >> /etc/resolv.conf
 # Set up Node.js and npm
 RUN apt-get update && apt-get install -y \
     curl \

backend/app/vectorstore.py CHANGED Viewed

@@ -8,11 +8,10 @@ import os
 import requests
 import nltk
 import logging
-import uuid
-import hashlib
-from typing import Optional, List
-from langchain_community.vectorstores import Qdrant
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain_community.document_loaders import DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -39,9 +38,8 @@ logger = logging.getLogger(__name__)
 # Global variable to store the singleton instance
 _qdrant_client_instance: Optional[QdrantClient] = None
-_vector_db_instance: Optional[Qdrant] = None
-# TODO fix bug. There's a logical error where, if you change the embedding model, the vector db instance might not be updated
-#   to match the new embedding model.
 _embedding_model_id: str = None
@@ -59,15 +57,25 @@ def _get_qdrant_client():
             os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
             _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH)
-        QDRANT_URL = os.environ.get("QDRANT_URL")
-        QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
-        _qdrant_client_instance = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
     return _qdrant_client_instance
-def _initialize_vector_db(embedding_model):
     os.makedirs("static/data", exist_ok=True)
     html_path = "static/data/langchain_rag_tutorial.html"
@@ -91,7 +99,6 @@ def _initialize_vector_db(embedding_model):
             category="documentation",
             version="1.0",
             language="en",
-            original_source=doc.metadata.get("source"),
         )
         for doc in documents
     ]
@@ -99,11 +106,9 @@ def _initialize_vector_db(embedding_model):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     split_chunks = text_splitter.split_documents(enriched_docs)
-    client = _get_qdrant_client()
     store_documents(
         split_chunks,
         PROBLEMS_REFERENCE_COLLECTION_NAME,
-        client,
     )
@@ -134,32 +139,38 @@ def get_all_unique_source_docs_in_collection(
 def store_documents(
     documents: List[Document],
     collection_name: str,
-    client: QdrantClient,
-    embedding_model=None,
 ):
-    if embedding_model is None:
-        embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
-    if not check_collection_exists(client, collection_name):
-        client.create_collection(
-            collection_name,
-            vectors_config=VectorParams(
-                size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE
-            ),
-        )
-    vectorstore = Qdrant(
-        client=client, collection_name=collection_name, embeddings=embedding_model
-    )
-    vectorstore.add_documents(
         documents=documents,
         ids=[get_document_hash_as_uuid(doc) for doc in documents],
     )
-# TODO already probably exposing too much by returning a Qdrant object here
-def get_vector_db(embedding_model_id: str = None) -> Qdrant:
     """
     Factory function that returns a singleton instance of the vector database.
     Creates the instance if it doesn't exist.
@@ -167,21 +178,45 @@ def get_vector_db(embedding_model_id: str = None) -> Qdrant:
     global _vector_db_instance
     if _vector_db_instance is None:
-        embedding_model = None
-        if embedding_model_id is None:
-            embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
-        else:
-            embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_id)
         client = _get_qdrant_client()
-        collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)
-        if collection_info.vectors_count is None or collection_info.vectors_count == 0:
-            _initialize_vector_db(embedding_model)
-        _vector_db_instance = Qdrant.from_existing_collection(
             collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
-            embedding_model=embedding_model,
-            client=client,
         )
     return _vector_db_instance

 import requests
 import nltk
 import logging
+import requests
+from typing import Optional, List, Union
+from langchain_qdrant import QdrantVectorStore
 from langchain_openai.embeddings import OpenAIEmbeddings
 from langchain_community.document_loaders import DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 # Global variable to store the singleton instance
 _qdrant_client_instance: Optional[QdrantClient] = None
+_vector_db_instance: Optional[QdrantVectorStore] = None
+_embedding_model: Optional[Union[OpenAIEmbeddings, HuggingFaceEmbeddings]] = None
 _embedding_model_id: str = None
             os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
             _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH)
+            # _qdrant_client_instance = QdrantClient(":memory:")
+            return _qdrant_client_instance
+        logger.info(
+            f"Attempting to connect to Qdrant at {os.environ.get("QDRANT_URL")}"
+        )
+        try:
+            _qdrant_client_instance = QdrantClient(
+                url=os.environ.get("QDRANT_URL"),
+                api_key=os.environ.get("QDRANT_API_KEY"),
+            )
+            logger.info("Successfully connected to Qdrant Cloud")
+        except Exception as e:
+            logger.error(f"Failed to connect to Qdrant Cloud: {str(e)}")
+            raise e
     return _qdrant_client_instance
+def _initialize_vector_db():
     os.makedirs("static/data", exist_ok=True)
     html_path = "static/data/langchain_rag_tutorial.html"
             category="documentation",
             version="1.0",
             language="en",
         )
         for doc in documents
     ]
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     split_chunks = text_splitter.split_documents(enriched_docs)
     store_documents(
         split_chunks,
         PROBLEMS_REFERENCE_COLLECTION_NAME,
     )
 def store_documents(
     documents: List[Document],
     collection_name: str,
+    embedding_model_id: str = None,
 ):
+    global _vector_db_instance
+    assert _vector_db_instance is not None, "Vector database instance not initialized"
+    embedding_model = get_embedding_model(embedding_model_id)
+    client = _get_qdrant_client()
+    _vector_db_instance.add_documents(
         documents=documents,
         ids=[get_document_hash_as_uuid(doc) for doc in documents],
     )
+def get_embedding_model(embedding_model_id: str = None):
+    """
+    Factory function that returns a singleton instance of the embedding model.
+    Creates the instance if it doesn't exist.
+    """
+    global _embedding_model, _embedding_model_id
+    if _embedding_model is None or embedding_model_id != _embedding_model_id:
+        if embedding_model_id is None:
+            _embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
+        else:
+            _embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_id)
+        _embedding_model_id = embedding_model_id
+    return _embedding_model
+def get_vector_db(embedding_model_id: str = None) -> QdrantVectorStore:
     """
     Factory function that returns a singleton instance of the vector database.
     Creates the instance if it doesn't exist.
     global _vector_db_instance
     if _vector_db_instance is None:
+        need_to_initialize_db = False
+        embedding_model = get_embedding_model(embedding_model_id)
         client = _get_qdrant_client()
+        if not check_collection_exists(client, PROBLEMS_REFERENCE_COLLECTION_NAME):
+            client.create_collection(
+                PROBLEMS_REFERENCE_COLLECTION_NAME,
+                vectors_config=VectorParams(
+                    size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE
+                ),
+            )
+            need_to_initialize_db = True
+        os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
+        # TODO temp. Need to close and reopen client to avoid RuntimeError: Storage folder /data/qdrant_db is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.
+        #   Better solution is to use Qdrant server instead of local file storage, but I'm not sure I can run Docker Compose in Hugging Face Spaces.
+        client.close()
+        _vector_db_instance = QdrantVectorStore.from_existing_collection(
+            # client=client,
+            # TODO temp. If this works, go file bug with langchain-qdrant
+            # location=":memory:",
+            path=LOCAL_QDRANT_PATH,
             collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
+            embedding=embedding_model,
         )
+        # TODO super hacky, but maybe I don't need client anymore? I'll just try to use QdrantVectorStore
+        # just really trying not to instantiate a new client to access local path
+        # because as long as QdrantVectorStore is instantiated, it will use the same client it created on the backend
+        client = None
+        if need_to_initialize_db:
+            _initialize_vector_db()
+        # vector_store = QdrantVectorStore(
+        #     client=client,
+        #     collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
+        #     embedding=embedding_model,
+        # )
     return _vector_db_instance

backend/app/vectorstore_helpers.py CHANGED Viewed

@@ -7,8 +7,12 @@ from typing import List
 def check_collection_exists(client: QdrantClient, collection_name: str) -> bool:
-    """Check if a collection exists in Qdrant."""
-    return client.get_collection(collection_name) is not None
 def get_document_hash_as_uuid(doc):

 def check_collection_exists(client: QdrantClient, collection_name: str) -> bool:
+    try:
+        # this is dumb, but it works. Not sure why get_collection raises an error if the collection doesn't exist.
+        client.get_collection(collection_name) is not None
+        return True
+    except ValueError:
+        return False
 def get_document_hash_as_uuid(doc):

backend/tests/test_vectorstore.py CHANGED Viewed

@@ -1,6 +1,10 @@
 import os
 from langchain.schema import Document
-from backend.app.vectorstore import get_vector_db
 def test_directory_creation():
@@ -44,5 +48,69 @@ def test_vector_db_singleton():
     instance1 = get_vector_db()
     instance2 = get_vector_db()
-    # Verify they are the same object
     assert instance1 is instance2

 import os
+import socket
+import pytest
+import requests
 from langchain.schema import Document
+from backend.app.vectorstore import get_vector_db, _get_qdrant_client
 def test_directory_creation():
     instance1 = get_vector_db()
     instance2 = get_vector_db()
     assert instance1 is instance2
+def test_qdrant_cloud_connection():
+    """Test basic connectivity to Qdrant Cloud"""
+    # Skip test if not configured for cloud
+    if not os.environ.get("QDRANT_URL") or not os.environ.get("QDRANT_API_KEY"):
+        pytest.skip("Qdrant Cloud credentials not configured")
+    try:
+        # Print URL for debugging (excluding any path components)
+        qdrant_url = os.environ.get("QDRANT_URL", "")
+        print(f"Attempting to connect to Qdrant at: {qdrant_url}")
+        # Try to parse the URL components
+        from urllib.parse import urlparse
+        parsed_url = urlparse(qdrant_url)
+        print(f"Scheme: {parsed_url.scheme}")
+        print(f"Hostname: {parsed_url.hostname}")
+        print(f"Port: {parsed_url.port}")
+        print(f"Path: {parsed_url.path}")
+        client = _get_qdrant_client()
+        client.get_collections()
+        assert True, "Connection successful"
+    except Exception as e:
+        assert False, f"Failed to connect to Qdrant Cloud: {str(e)}"
+def test_external_connectivity():
+    """Test basic external connectivity and DNS resolution.
+    Test needed since Docker gave an issue with this before. Couldn't resolve Qdrant host.
+    """
+    # Skip test if not configured for cloud
+    if not os.environ.get("QDRANT_URL") or not os.environ.get("QDRANT_API_KEY"):
+        pytest.skip("Qdrant Cloud credentials not configured")
+    # Test DNS resolution first
+    try:
+        # Try to resolve google.com
+        google_ip = socket.gethostbyname("google.com")
+        print(f"Successfully resolved google.com to {google_ip}")
+        # If we have Qdrant URL, try to resolve that too
+        qdrant_url = os.environ.get("QDRANT_URL", "")
+        if qdrant_url:
+            qdrant_host = (
+                qdrant_url.replace("https://", "").replace("http://", "").split("/")[0]
+            )
+            print(f"Qdrant host: {qdrant_host}")
+            qdrant_ip = socket.gethostbyname(qdrant_host)
+            print(f"Successfully resolved Qdrant host {qdrant_host}")
+    except socket.gaierror as e:
+        assert False, f"DNS resolution failed: {str(e)}"
+    # Test HTTP connectivity
+    try:
+        response = requests.get("https://www.google.com", timeout=5)
+        assert (
+            response.status_code == 200
+        ), "Expected successful response from google.com"
+    except requests.exceptions.RequestException as e:
+        assert False, f"Failed to connect to google.com: {str(e)}"

pyproject.toml CHANGED Viewed

@@ -24,7 +24,7 @@ dependencies = [
     "pytest-dotenv>=0.5.2",
     "unstructured",
     "haystack-ai==2.0.1",
-    "qdrant-client==1.8.2",
     "qdrant-haystack==3.3.1",
     "ipykernel",
     "sentence-transformers>=3.4.1",
@@ -35,6 +35,7 @@ dependencies = [
     "black>=25.1.0",
     "scrapy==2.12.0",
     "fastembed==0.6.0",
 ]
 [tool.setuptools]

     "pytest-dotenv>=0.5.2",
     "unstructured",
     "haystack-ai==2.0.1",
+    "qdrant-client==1.13.3",
     "qdrant-haystack==3.3.1",
     "ipykernel",
     "sentence-transformers>=3.4.1",
     "black>=25.1.0",
     "scrapy==2.12.0",
     "fastembed==0.6.0",
+    "langchain-qdrant==0.2.0",
 ]
 [tool.setuptools]

test_vectorstore_code.ipynb CHANGED Viewed

@@ -100,6 +100,26 @@
     "collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 7,

     "collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CollectionsResponse(collections=[])"
+      ]
+     },
+     "execution_count": 88,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "client.get_collections()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,