Rsr2425 committed on
Commit
654e910
·
1 Parent(s): 895b645

Fixed vectorstore code and got it working locally

Browse files
Dockerfile CHANGED
@@ -15,6 +15,10 @@ WORKDIR /app
15
 
16
  RUN mkdir -p /app/static/data
17
 
 
 
 
 
18
  # Create a non-root user
19
  RUN useradd -m -u 1000 user
20
  RUN chown -R user:user /app
 
15
 
16
  RUN mkdir -p /app/static/data
17
 
18
+ # # Add DNS configuration
19
+ # RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && \
20
+ # echo "nameserver 8.8.4.4" >> /etc/resolv.conf
21
+
22
  # Create a non-root user
23
  RUN useradd -m -u 1000 user
24
  RUN chown -R user:user /app
Dockerfile.test CHANGED
@@ -11,6 +11,10 @@ RUN npm run build
11
  # Use Python image with uv pre-installed
12
  FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
13
 
 
 
 
 
14
  # Set up Node.js and npm
15
  RUN apt-get update && apt-get install -y \
16
  curl \
 
11
  # Use Python image with uv pre-installed
12
  FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
13
 
14
+ # Add DNS configuration
15
+ # RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && \
16
+ # echo "nameserver 8.8.4.4" >> /etc/resolv.conf
17
+
18
  # Set up Node.js and npm
19
  RUN apt-get update && apt-get install -y \
20
  curl \
backend/app/vectorstore.py CHANGED
@@ -8,11 +8,10 @@ import os
8
  import requests
9
  import nltk
10
  import logging
11
- import uuid
12
- import hashlib
13
 
14
- from typing import Optional, List
15
- from langchain_community.vectorstores import Qdrant
16
  from langchain_openai.embeddings import OpenAIEmbeddings
17
  from langchain_community.document_loaders import DirectoryLoader
18
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -39,9 +38,8 @@ logger = logging.getLogger(__name__)
39
 
40
  # Global variable to store the singleton instance
41
  _qdrant_client_instance: Optional[QdrantClient] = None
42
- _vector_db_instance: Optional[Qdrant] = None
43
- # TODO fix bug. There's a logical error where if you change the embedding model, the vector db instance might not be updated
44
- # to match the new embedding model.
45
  _embedding_model_id: str = None
46
 
47
 
@@ -59,15 +57,25 @@ def _get_qdrant_client():
59
 
60
  os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
61
  _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH)
 
 
62
 
63
- QDRANT_URL = os.environ.get("QDRANT_URL")
64
- QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
65
-
66
- _qdrant_client_instance = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
 
 
 
 
 
 
 
 
67
  return _qdrant_client_instance
68
 
69
 
70
- def _initialize_vector_db(embedding_model):
71
  os.makedirs("static/data", exist_ok=True)
72
 
73
  html_path = "static/data/langchain_rag_tutorial.html"
@@ -91,7 +99,6 @@ def _initialize_vector_db(embedding_model):
91
  category="documentation",
92
  version="1.0",
93
  language="en",
94
- original_source=doc.metadata.get("source"),
95
  )
96
  for doc in documents
97
  ]
@@ -99,11 +106,9 @@ def _initialize_vector_db(embedding_model):
99
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
100
  split_chunks = text_splitter.split_documents(enriched_docs)
101
 
102
- client = _get_qdrant_client()
103
  store_documents(
104
  split_chunks,
105
  PROBLEMS_REFERENCE_COLLECTION_NAME,
106
- client,
107
  )
108
 
109
 
@@ -134,32 +139,38 @@ def get_all_unique_source_docs_in_collection(
134
  def store_documents(
135
  documents: List[Document],
136
  collection_name: str,
137
- client: QdrantClient,
138
- embedding_model=None,
139
  ):
140
- if embedding_model is None:
141
- embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
142
-
143
- if not check_collection_exists(client, collection_name):
144
- client.create_collection(
145
- collection_name,
146
- vectors_config=VectorParams(
147
- size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE
148
- ),
149
- )
150
 
151
- vectorstore = Qdrant(
152
- client=client, collection_name=collection_name, embeddings=embedding_model
153
- )
154
 
155
- vectorstore.add_documents(
156
  documents=documents,
157
  ids=[get_document_hash_as_uuid(doc) for doc in documents],
158
  )
159
 
160
 
161
- # TODO already probably exposing too much by returning a Qdrant object here
162
- def get_vector_db(embedding_model_id: str = None) -> Qdrant:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  """
164
  Factory function that returns a singleton instance of the vector database.
165
  Creates the instance if it doesn't exist.
@@ -167,21 +178,45 @@ def get_vector_db(embedding_model_id: str = None) -> Qdrant:
167
  global _vector_db_instance
168
 
169
  if _vector_db_instance is None:
170
- embedding_model = None
171
- if embedding_model_id is None:
172
- embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
173
- else:
174
- embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_id)
175
 
176
  client = _get_qdrant_client()
177
- collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)
178
- if collection_info.vectors_count is None or collection_info.vectors_count == 0:
179
- _initialize_vector_db(embedding_model)
180
 
181
- _vector_db_instance = Qdrant.from_existing_collection(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
183
- embedding_model=embedding_model,
184
- client=client,
185
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  return _vector_db_instance
 
8
  import requests
9
  import nltk
10
  import logging
11
+ import requests
 
12
 
13
+ from typing import Optional, List, Union
14
+ from langchain_qdrant import QdrantVectorStore
15
  from langchain_openai.embeddings import OpenAIEmbeddings
16
  from langchain_community.document_loaders import DirectoryLoader
17
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
38
 
39
  # Global variable to store the singleton instance
40
  _qdrant_client_instance: Optional[QdrantClient] = None
41
+ _vector_db_instance: Optional[QdrantVectorStore] = None
42
+ _embedding_model: Optional[Union[OpenAIEmbeddings, HuggingFaceEmbeddings]] = None
 
43
  _embedding_model_id: str = None
44
 
45
 
 
57
 
58
  os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
59
  _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH)
60
+ # _qdrant_client_instance = QdrantClient(":memory:")
61
+ return _qdrant_client_instance
62
 
63
+ logger.info(
64
+ f"Attempting to connect to Qdrant at {os.environ.get("QDRANT_URL")}"
65
+ )
66
+ try:
67
+ _qdrant_client_instance = QdrantClient(
68
+ url=os.environ.get("QDRANT_URL"),
69
+ api_key=os.environ.get("QDRANT_API_KEY"),
70
+ )
71
+ logger.info("Successfully connected to Qdrant Cloud")
72
+ except Exception as e:
73
+ logger.error(f"Failed to connect to Qdrant Cloud: {str(e)}")
74
+ raise e
75
  return _qdrant_client_instance
76
 
77
 
78
+ def _initialize_vector_db():
79
  os.makedirs("static/data", exist_ok=True)
80
 
81
  html_path = "static/data/langchain_rag_tutorial.html"
 
99
  category="documentation",
100
  version="1.0",
101
  language="en",
 
102
  )
103
  for doc in documents
104
  ]
 
106
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
107
  split_chunks = text_splitter.split_documents(enriched_docs)
108
 
 
109
  store_documents(
110
  split_chunks,
111
  PROBLEMS_REFERENCE_COLLECTION_NAME,
 
112
  )
113
 
114
 
 
139
  def store_documents(
140
  documents: List[Document],
141
  collection_name: str,
142
+ embedding_model_id: str = None,
 
143
  ):
144
+ global _vector_db_instance
145
+ assert _vector_db_instance is not None, "Vector database instance not initialized"
 
 
 
 
 
 
 
 
146
 
147
+ embedding_model = get_embedding_model(embedding_model_id)
148
+ client = _get_qdrant_client()
 
149
 
150
+ _vector_db_instance.add_documents(
151
  documents=documents,
152
  ids=[get_document_hash_as_uuid(doc) for doc in documents],
153
  )
154
 
155
 
156
+ def get_embedding_model(embedding_model_id: str = None):
157
+ """
158
+ Factory function that returns a singleton instance of the embedding model.
159
+ Creates the instance if it doesn't exist, or re-creates it when a different embedding_model_id is requested.
160
+ """
161
+ global _embedding_model, _embedding_model_id
162
+
163
+ if _embedding_model is None or embedding_model_id != _embedding_model_id:
164
+ if embedding_model_id is None:
165
+ _embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
166
+ else:
167
+ _embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_id)
168
+ _embedding_model_id = embedding_model_id
169
+
170
+ return _embedding_model
171
+
172
+
173
+ def get_vector_db(embedding_model_id: str = None) -> QdrantVectorStore:
174
  """
175
  Factory function that returns a singleton instance of the vector database.
176
  Creates the instance if it doesn't exist.
 
178
  global _vector_db_instance
179
 
180
  if _vector_db_instance is None:
181
+ need_to_initialize_db = False
182
+ embedding_model = get_embedding_model(embedding_model_id)
 
 
 
183
 
184
  client = _get_qdrant_client()
 
 
 
185
 
186
+ if not check_collection_exists(client, PROBLEMS_REFERENCE_COLLECTION_NAME):
187
+ client.create_collection(
188
+ PROBLEMS_REFERENCE_COLLECTION_NAME,
189
+ vectors_config=VectorParams(
190
+ size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE
191
+ ),
192
+ )
193
+ need_to_initialize_db = True
194
+
195
+ os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
196
+
197
+ # TODO temp. Need to close and reopen client to avoid RuntimeError: Storage folder /data/qdrant_db is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.
198
+ # Better solution is to use Qdrant server instead of local file storage, but I'm not sure I can run Docker Compose in Hugging Face Spaces.
199
+ client.close()
200
+ _vector_db_instance = QdrantVectorStore.from_existing_collection(
201
+ # client=client,
202
+ # TODO temp. If this works, go file bug with langchain-qdrant
203
+ # location=":memory:",
204
+ path=LOCAL_QDRANT_PATH,
205
  collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
206
+ embedding=embedding_model,
 
207
  )
208
+ # TODO super hacky, but maybe I don't need client anymore? I'll just try to use QdrantVectorStore
209
+ # just really trying not to instantiate a new client to access local path
210
+ # because as long as QdrantVectorStore is instantiated, it will use the same client it created on the backend
211
+ client = None
212
+
213
+ if need_to_initialize_db:
214
+ _initialize_vector_db()
215
+
216
+ # vector_store = QdrantVectorStore(
217
+ # client=client,
218
+ # collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
219
+ # embedding=embedding_model,
220
+ # )
221
 
222
  return _vector_db_instance
backend/app/vectorstore_helpers.py CHANGED
@@ -7,8 +7,12 @@ from typing import List
7
 
8
 
9
  def check_collection_exists(client: QdrantClient, collection_name: str) -> bool:
10
- """Check if a collection exists in Qdrant."""
11
- return client.get_collection(collection_name) is not None
 
 
 
 
12
 
13
 
14
  def get_document_hash_as_uuid(doc):
 
7
 
8
 
9
  def check_collection_exists(client: QdrantClient, collection_name: str) -> bool:
10
+ try:
11
+ # get_collection raises (ValueError is what we catch here) instead of returning None when the collection doesn't exist, so a missing collection is detected via the exception.
12
+ client.get_collection(collection_name) is not None
13
+ return True
14
+ except ValueError:
15
+ return False
16
 
17
 
18
  def get_document_hash_as_uuid(doc):
backend/tests/test_vectorstore.py CHANGED
@@ -1,6 +1,10 @@
1
  import os
 
 
 
 
2
  from langchain.schema import Document
3
- from backend.app.vectorstore import get_vector_db
4
 
5
 
6
  def test_directory_creation():
@@ -44,5 +48,69 @@ def test_vector_db_singleton():
44
  instance1 = get_vector_db()
45
  instance2 = get_vector_db()
46
 
47
- # Verify they are the same object
48
  assert instance1 is instance2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import socket
3
+ import pytest
4
+ import requests
5
+
6
  from langchain.schema import Document
7
+ from backend.app.vectorstore import get_vector_db, _get_qdrant_client
8
 
9
 
10
  def test_directory_creation():
 
48
  instance1 = get_vector_db()
49
  instance2 = get_vector_db()
50
 
 
51
  assert instance1 is instance2
52
+
53
+
54
+ def test_qdrant_cloud_connection():
55
+ """Test basic connectivity to Qdrant Cloud"""
56
+ # Skip test if not configured for cloud
57
+ if not os.environ.get("QDRANT_URL") or not os.environ.get("QDRANT_API_KEY"):
58
+
59
+ pytest.skip("Qdrant Cloud credentials not configured")
60
+
61
+ try:
62
+ # Print the configured Qdrant URL for debugging (the individual components are printed below)
63
+ qdrant_url = os.environ.get("QDRANT_URL", "")
64
+ print(f"Attempting to connect to Qdrant at: {qdrant_url}")
65
+
66
+ # Try to parse the URL components
67
+ from urllib.parse import urlparse
68
+
69
+ parsed_url = urlparse(qdrant_url)
70
+ print(f"Scheme: {parsed_url.scheme}")
71
+ print(f"Hostname: {parsed_url.hostname}")
72
+ print(f"Port: {parsed_url.port}")
73
+ print(f"Path: {parsed_url.path}")
74
+
75
+ client = _get_qdrant_client()
76
+ client.get_collections()
77
+ assert True, "Connection successful"
78
+ except Exception as e:
79
+ assert False, f"Failed to connect to Qdrant Cloud: {str(e)}"
80
+
81
+
82
+ def test_external_connectivity():
83
+ """Test basic external connectivity and DNS resolution.
84
+ Test needed since Docker gave an issue with this before. Couldn't resolve Qdrant host.
85
+ """
86
+
87
+ # Skip test if not configured for cloud
88
+ if not os.environ.get("QDRANT_URL") or not os.environ.get("QDRANT_API_KEY"):
89
+ pytest.skip("Qdrant Cloud credentials not configured")
90
+
91
+ # Test DNS resolution first
92
+ try:
93
+ # Try to resolve google.com
94
+ google_ip = socket.gethostbyname("google.com")
95
+ print(f"Successfully resolved google.com to {google_ip}")
96
+
97
+ # If we have Qdrant URL, try to resolve that too
98
+ qdrant_url = os.environ.get("QDRANT_URL", "")
99
+ if qdrant_url:
100
+ qdrant_host = (
101
+ qdrant_url.replace("https://", "").replace("http://", "").split("/")[0]
102
+ )
103
+ print(f"Qdrant host: {qdrant_host}")
104
+ qdrant_ip = socket.gethostbyname(qdrant_host)
105
+ print(f"Successfully resolved Qdrant host {qdrant_host}")
106
+ except socket.gaierror as e:
107
+ assert False, f"DNS resolution failed: {str(e)}"
108
+
109
+ # Test HTTP connectivity
110
+ try:
111
+ response = requests.get("https://www.google.com", timeout=5)
112
+ assert (
113
+ response.status_code == 200
114
+ ), "Expected successful response from google.com"
115
+ except requests.exceptions.RequestException as e:
116
+ assert False, f"Failed to connect to google.com: {str(e)}"
pyproject.toml CHANGED
@@ -24,7 +24,7 @@ dependencies = [
24
  "pytest-dotenv>=0.5.2",
25
  "unstructured",
26
  "haystack-ai==2.0.1",
27
- "qdrant-client==1.8.2",
28
  "qdrant-haystack==3.3.1",
29
  "ipykernel",
30
  "sentence-transformers>=3.4.1",
@@ -35,6 +35,7 @@ dependencies = [
35
  "black>=25.1.0",
36
  "scrapy==2.12.0",
37
  "fastembed==0.6.0",
 
38
  ]
39
 
40
  [tool.setuptools]
 
24
  "pytest-dotenv>=0.5.2",
25
  "unstructured",
26
  "haystack-ai==2.0.1",
27
+ "qdrant-client==1.13.3",
28
  "qdrant-haystack==3.3.1",
29
  "ipykernel",
30
  "sentence-transformers>=3.4.1",
 
35
  "black>=25.1.0",
36
  "scrapy==2.12.0",
37
  "fastembed==0.6.0",
38
+ "langchain-qdrant==0.2.0",
39
  ]
40
 
41
  [tool.setuptools]
test_vectorstore_code.ipynb CHANGED
@@ -100,6 +100,26 @@
100
  "collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)"
101
  ]
102
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  {
104
  "cell_type": "code",
105
  "execution_count": 7,
 
100
  "collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)"
101
  ]
102
  },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 88,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "data": {
110
+ "text/plain": [
111
+ "CollectionsResponse(collections=[])"
112
+ ]
113
+ },
114
+ "execution_count": 88,
115
+ "metadata": {},
116
+ "output_type": "execute_result"
117
+ }
118
+ ],
119
+ "source": [
120
+ "client.get_collections()"
121
+ ]
122
+ },
123
  {
124
  "cell_type": "code",
125
  "execution_count": 7,