Rsr2425 committed on
Commit
459496d
·
1 Parent(s): d877267

Added test nb for new Qdrant code

Browse files
Files changed (1) hide show
  1. test_vectorstore_code.ipynb +586 -0
test_vectorstore_code.ipynb ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/Users/ryanrodriguez/src/Simplify/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "import os\n",
19
+ "import requests\n",
20
+ "import nltk\n",
21
+ "import logging\n",
22
+ "import uuid\n",
23
+ "\n",
24
+ "from typing import Optional, List\n",
25
+ "from langchain_community.vectorstores import Qdrant\n",
26
+ "from langchain_openai.embeddings import OpenAIEmbeddings\n",
27
+ "from langchain_community.document_loaders import DirectoryLoader\n",
28
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
29
+ "from langchain_huggingface import HuggingFaceEmbeddings\n",
30
+ "from qdrant_client import QdrantClient\n",
31
+ "from langchain.schema import Document"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 2,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "PROBLEMS_REFERENCE_COLLECTION_NAME = \"problems_reference_collection\""
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 3,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "_qdrant_client_instance: Optional[QdrantClient] = None\n",
50
+ "\n",
51
+ "def get_qdrant_client():\n",
52
+ " global _qdrant_client_instance\n",
53
+ "\n",
54
+ " if _qdrant_client_instance is None:\n",
55
+ " QDRANT_URL = \"https://f920e9b6-c14c-40e4-9fbe-a2aabf26e2b5.us-east-1-0.aws.cloud.qdrant.io\"\n",
56
+ " QDRANT_API_KEY = \"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.lWz54tW8xpFc85mqDRgmj_luvKbEcJhK6hkLVNMEKsk\"\n",
57
+ "\n",
58
+ " _qdrant_client_instance = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)\n",
59
+ " return _qdrant_client_instance"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 4,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "DEFAULT_EMBEDDING_MODEL_ID = \"text-embedding-3-small\"\n",
69
+ "embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)\n",
70
+ "\n",
71
+ "client = get_qdrant_client()"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 5,
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "data": {
81
+ "text/plain": [
82
+ "CollectionsResponse(collections=[CollectionDescription(name='problems_reference_collection'), CollectionDescription(name='star_charts')])"
83
+ ]
84
+ },
85
+ "execution_count": 5,
86
+ "metadata": {},
87
+ "output_type": "execute_result"
88
+ }
89
+ ],
90
+ "source": [
91
+ "client.get_collections()"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 6,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 7,
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "collection_info.vectors_count"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 8,
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "def store_documents(\n",
119
+ " source: str, documents: List[Document], collection_name: str, client: QdrantClient\n",
120
+ "):\n",
121
+ " client.add(\n",
122
+ " collection_name=collection_name,\n",
123
+ " documents=documents,\n",
124
+ " ids=[str(uuid.uuid4()) for _ in documents],\n",
125
+ " payload={\"source\": source},\n",
126
+ " )\n",
127
+ "\n",
128
+ "def get_docs(embedding_model):\n",
129
+ " # Create static/data directory if it doesn't exist\n",
130
+ " os.makedirs(\"static/data\", exist_ok=True)\n",
131
+ "\n",
132
+ " # Download and save the webpage if it doesn't exist\n",
133
+ " html_path = \"static/data/langchain_rag_tutorial.html\"\n",
134
+ " if not os.path.exists(html_path):\n",
135
+ " url = \"https://python.langchain.com/docs/tutorials/rag/\"\n",
136
+ " response = requests.get(url)\n",
137
+ " with open(html_path, \"w\", encoding=\"utf-8\") as f:\n",
138
+ " f.write(response.text)\n",
139
+ "\n",
140
+ " # Load HTML files from static/data directory\n",
141
+ " loader = DirectoryLoader(\"static/data\", glob=\"*.html\")\n",
142
+ " documents = loader.load()\n",
143
+ "\n",
144
+ " # Split documents into chunks\n",
145
+ " text_splitter = RecursiveCharacterTextSplitter(\n",
146
+ " chunk_size=1000, chunk_overlap=200\n",
147
+ " )\n",
148
+ " split_chunks = text_splitter.split_documents(documents)\n",
149
+ "\n",
150
+ " return split_chunks"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 9,
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "docs = get_docs(embedding_model)"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 10,
165
+ "metadata": {},
166
+ "outputs": [
167
+ {
168
+ "data": {
169
+ "text/plain": [
170
+ "1536"
171
+ ]
172
+ },
173
+ "execution_count": 10,
174
+ "metadata": {},
175
+ "output_type": "execute_result"
176
+ }
177
+ ],
178
+ "source": [
179
+ "collection_info.config.params.vectors.size"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 79,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "data": {
189
+ "text/plain": [
190
+ "True"
191
+ ]
192
+ },
193
+ "execution_count": 79,
194
+ "metadata": {},
195
+ "output_type": "execute_result"
196
+ }
197
+ ],
198
+ "source": [
199
+ "client.delete_collection(\"test_collection\")\n",
200
+ "client.delete_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 80,
206
+ "metadata": {},
207
+ "outputs": [
208
+ {
209
+ "data": {
210
+ "text/plain": [
211
+ "True"
212
+ ]
213
+ },
214
+ "execution_count": 80,
215
+ "metadata": {},
216
+ "output_type": "execute_result"
217
+ }
218
+ ],
219
+ "source": [
220
+ "from qdrant_client.models import VectorParams, Distance\n",
221
+ "client.create_collection(\n",
222
+ " PROBLEMS_REFERENCE_COLLECTION_NAME,\n",
223
+ " vectors_config=VectorParams(size=1536, distance=Distance.COSINE),\n",
224
+ ")"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": 81,
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": [
233
+ "vectorstore = Qdrant(\n",
234
+ " client=client,\n",
235
+ " collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,\n",
236
+ " embeddings=embedding_model\n",
237
+ ")"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": 70,
243
+ "metadata": {},
244
+ "outputs": [],
245
+ "source": [
246
+ "import hashlib\n",
247
+ "import uuid\n",
248
+ "\n",
249
+ "def get_document_hash_as_uuid(doc):\n",
250
+ " # First get the hash of the content\n",
251
+ " content_hash = hashlib.sha256(doc.page_content.encode()).hexdigest()\n",
252
+ " \n",
253
+ " # Convert the first 32 characters of the hash (16 bytes) to UUID\n",
254
+ " # UUID requires exactly 16 bytes (32 hex characters)\n",
255
+ " uuid_from_hash = uuid.UUID(content_hash[:32])\n",
256
+ " \n",
257
+ " return str(uuid_from_hash)"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": 74,
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "data": {
267
+ "text/plain": [
268
+ "['794f95e2-bee6-e5cc-ed64-7c6fe1aef022',\n",
269
+ " '6fa4f018-af75-fd5c-a90e-d460b30972ef',\n",
270
+ " 'ad483089-34a0-5f96-0588-5e288b5964b5',\n",
271
+ " 'b3e2ac2a-35e0-58b3-d5f5-d98929d6caab',\n",
272
+ " '4cf742c8-601a-65f1-cfd6-79876b068503',\n",
273
+ " 'c74bc126-5e9f-d70c-c0a0-3ec91ea248d0',\n",
274
+ " '6366496e-5133-00f3-36d5-cdd91b479aa5',\n",
275
+ " 'c9e530f6-b567-ffc5-cd44-781022dfcfc6',\n",
276
+ " '512f428b-05a7-920b-c2a9-1211406bb7ed',\n",
277
+ " '8a092ec4-c4fd-c234-2b7c-bb2e23cbe973',\n",
278
+ " '54813989-564e-3b6c-3ef8-451f33cdbf6b',\n",
279
+ " 'e0611fe9-cade-2e43-6966-82d7a26c0278',\n",
280
+ " '1eebf00f-a10a-0d73-982e-cd8844945c18',\n",
281
+ " '02002419-ec33-775d-2b85-bc53e12aa3cf',\n",
282
+ " '62a197cd-0e46-e846-b7dc-fbd0dc210a31',\n",
283
+ " 'aa1618aa-b1b1-3b19-e356-81b8b21affd4',\n",
284
+ " 'db4474e5-7265-f6e5-e242-bca78d1503a1',\n",
285
+ " 'bebdc4ad-f0a3-6480-5c82-dc8f0ace870b',\n",
286
+ " '6dc203ca-380d-a452-84cd-3ee0abdd47b5',\n",
287
+ " 'fe66ef26-24a3-199c-ba07-3a068a4b1c75',\n",
288
+ " '6cb951d3-12c4-0614-a07e-4ac3c4b9b52f',\n",
289
+ " 'f98f92b9-6d1f-226a-eed7-656edc04db79',\n",
290
+ " 'ccfef227-20e2-bf29-e740-f66f5e376b72',\n",
291
+ " 'e53a74e8-118d-2d42-78ed-d6ea3ad93201',\n",
292
+ " '9772a884-e0b8-8d73-c464-17e839d691a8',\n",
293
+ " 'dc51dd9c-2467-e0dd-c17a-4f3947770146',\n",
294
+ " '6f1523ed-c6b0-62ba-3261-05f993373adb',\n",
295
+ " '97bad942-3a69-447f-d384-9b9a60f9cf88',\n",
296
+ " '1826ed11-0cff-7ab0-4137-4c17ddd9e7fc',\n",
297
+ " '3f71153e-d378-59d1-03d8-7f1bbe15e4c0',\n",
298
+ " 'c17ea483-30a4-014b-c42f-7c6c44b7b47a',\n",
299
+ " '986c2383-4509-0f92-3834-aeea851a216d',\n",
300
+ " '4164df32-97b3-c1f3-ae38-56008f47c435',\n",
301
+ " '3f0d297f-f62a-a8c0-8d8d-b226788f3a40',\n",
302
+ " '7c4bee9b-93ad-26bb-e49d-770e03276add',\n",
303
+ " '7c211878-b398-83a5-90ce-c7839e7d88d1',\n",
304
+ " '2ccb136c-496b-9e5b-a388-57c1c018e5cb',\n",
305
+ " 'b07a6e2e-05e6-550f-a2db-ade353284be5',\n",
306
+ " '44c41257-7a12-83da-8f44-d7e9b1968d45',\n",
307
+ " '638ab06e-5ac1-134e-ded3-af6536a2b04d',\n",
308
+ " 'a2fb7256-e90a-169f-1cc3-7932b73f0cba',\n",
309
+ " '12ee5cfe-be76-be09-a486-ca4252f5f7cc',\n",
310
+ " '2b38415a-1f29-8cda-8625-7d0b0a1c8c26',\n",
311
+ " 'a377526c-aee9-a842-a990-7f2ccbc7a644',\n",
312
+ " '686ad547-a6ba-8187-22c9-5c312575713a',\n",
313
+ " 'ebebc277-7ba6-7b8b-0368-efee03ccc2d7',\n",
314
+ " '2d3b4ed2-70ec-4118-c800-b6f7a48f7b81',\n",
315
+ " 'b905ba7d-7497-ec41-729b-4b343c98db2c',\n",
316
+ " '299f6d65-39b1-3af4-0bf2-f7fee062f6e2',\n",
317
+ " 'fd62bfd1-9a06-40ee-1ede-0590e9de85dc']"
318
+ ]
319
+ },
320
+ "execution_count": 74,
321
+ "metadata": {},
322
+ "output_type": "execute_result"
323
+ }
324
+ ],
325
+ "source": [
326
+ "vectorstore.add_documents(\n",
327
+ " documents=docs,\n",
328
+ " ids=[get_document_hash_as_uuid(doc) for doc in docs],\n",
329
+ ")"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 47,
335
+ "metadata": {},
336
+ "outputs": [
337
+ {
338
+ "data": {
339
+ "text/plain": [
340
+ "Document(metadata={'source': 'static/data/langchain_rag_tutorial.html'}, page_content='Tutorials\\n\\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\\n\\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\\n\\nOne of the most powerful applications enabled by LLMs is sophisticated question-answering (Q&A) chatbots. These are applications that can answer questions about specific source information. These applications use a technique known as Retrieval Augmented Generation, or RAG.\\n\\nThis is a multi-part tutorial:\\n\\nPart 1 (this guide) introduces RAG and walks through a minimal implementation.\\n\\nPart 2 extends the implementation to accommodate conversation-style interactions and multi-step retrieval processes.')"
341
+ ]
342
+ },
343
+ "execution_count": 47,
344
+ "metadata": {},
345
+ "output_type": "execute_result"
346
+ }
347
+ ],
348
+ "source": [
349
+ "docs[0]"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 58,
355
+ "metadata": {},
356
+ "outputs": [
357
+ {
358
+ "data": {
359
+ "text/plain": [
360
+ "CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=100, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})"
361
+ ]
362
+ },
363
+ "execution_count": 58,
364
+ "metadata": {},
365
+ "output_type": "execute_result"
366
+ }
367
+ ],
368
+ "source": [
369
+ "problem_reference_collection = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)\n",
370
+ "problem_reference_collection\n"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 57,
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "result = vectorstore.similarity_search(\"What is the capital of France?\")"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": 53,
385
+ "metadata": {},
386
+ "outputs": [
387
+ {
388
+ "data": {
389
+ "text/plain": [
390
+ "{'source': 'static/data/langchain_rag_tutorial.html',\n",
391
+ " '_id': '7072fce1-91f3-43f8-bd1c-2a2efebf258c',\n",
392
+ " '_collection_name': 'problems_reference_collection'}"
393
+ ]
394
+ },
395
+ "execution_count": 53,
396
+ "metadata": {},
397
+ "output_type": "execute_result"
398
+ }
399
+ ],
400
+ "source": [
401
+ "result[0].metadata"
402
+ ]
403
+ },
404
+ {
405
+ "cell_type": "code",
406
+ "execution_count": 82,
407
+ "metadata": {},
408
+ "outputs": [],
409
+ "source": [
410
+ "def enrich_document_metadata(doc: Document, **additional_metadata) -> Document:\n",
411
+ " \"\"\"Merge additional_metadata into doc.metadata in place and return the same document; existing keys with the same name are overwritten.\"\"\"\n",
412
+ " doc.metadata.update(additional_metadata)\n",
413
+ " return doc\n",
414
+ "\n",
415
+ "enriched_docs = [\n",
416
+ " enrich_document_metadata(\n",
417
+ " doc,\n",
418
+ " title=\"LangChain RAG Tutorial\",\n",
419
+ " # type=\"tutorial\",\n",
420
+ " source_url=\"https://python.langchain.com/docs/tutorials/rag/\",\n",
421
+ " description=\"Official LangChain tutorial on building RAG applications\",\n",
422
+ " ) for doc in docs\n",
423
+ " ]"
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 78,
429
+ "metadata": {},
430
+ "outputs": [
431
+ {
432
+ "data": {
433
+ "text/plain": [
434
+ "Document(metadata={'source': 'static/data/langchain_rag_tutorial.html', 'title': 'LangChain RAG Tutorial', 'type': 'tutorial', 'source_url': 'https://python.langchain.com/docs/tutorials/rag/', 'description': 'Official LangChain tutorial on building RAG applications'}, page_content='Tutorials\\n\\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\\n\\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\\n\\nOne of the most powerful applications enabled by LLMs is sophisticated question-answering (Q&A) chatbots. These are applications that can answer questions about specific source information. These applications use a technique known as Retrieval Augmented Generation, or RAG.\\n\\nThis is a multi-part tutorial:\\n\\nPart 1 (this guide) introduces RAG and walks through a minimal implementation.\\n\\nPart 2 extends the implementation to accommodate conversation-style interactions and multi-step retrieval processes.')"
435
+ ]
436
+ },
437
+ "execution_count": 78,
438
+ "metadata": {},
439
+ "output_type": "execute_result"
440
+ }
441
+ ],
442
+ "source": [
443
+ "enriched_docs[0]"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 83,
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "data": {
453
+ "text/plain": [
454
+ "['794f95e2-bee6-e5cc-ed64-7c6fe1aef022',\n",
455
+ " '6fa4f018-af75-fd5c-a90e-d460b30972ef',\n",
456
+ " 'ad483089-34a0-5f96-0588-5e288b5964b5',\n",
457
+ " 'b3e2ac2a-35e0-58b3-d5f5-d98929d6caab',\n",
458
+ " '4cf742c8-601a-65f1-cfd6-79876b068503',\n",
459
+ " 'c74bc126-5e9f-d70c-c0a0-3ec91ea248d0',\n",
460
+ " '6366496e-5133-00f3-36d5-cdd91b479aa5',\n",
461
+ " 'c9e530f6-b567-ffc5-cd44-781022dfcfc6',\n",
462
+ " '512f428b-05a7-920b-c2a9-1211406bb7ed',\n",
463
+ " '8a092ec4-c4fd-c234-2b7c-bb2e23cbe973',\n",
464
+ " '54813989-564e-3b6c-3ef8-451f33cdbf6b',\n",
465
+ " 'e0611fe9-cade-2e43-6966-82d7a26c0278',\n",
466
+ " '1eebf00f-a10a-0d73-982e-cd8844945c18',\n",
467
+ " '02002419-ec33-775d-2b85-bc53e12aa3cf',\n",
468
+ " '62a197cd-0e46-e846-b7dc-fbd0dc210a31',\n",
469
+ " 'aa1618aa-b1b1-3b19-e356-81b8b21affd4',\n",
470
+ " 'db4474e5-7265-f6e5-e242-bca78d1503a1',\n",
471
+ " 'bebdc4ad-f0a3-6480-5c82-dc8f0ace870b',\n",
472
+ " '6dc203ca-380d-a452-84cd-3ee0abdd47b5',\n",
473
+ " 'fe66ef26-24a3-199c-ba07-3a068a4b1c75',\n",
474
+ " '6cb951d3-12c4-0614-a07e-4ac3c4b9b52f',\n",
475
+ " 'f98f92b9-6d1f-226a-eed7-656edc04db79',\n",
476
+ " 'ccfef227-20e2-bf29-e740-f66f5e376b72',\n",
477
+ " 'e53a74e8-118d-2d42-78ed-d6ea3ad93201',\n",
478
+ " '9772a884-e0b8-8d73-c464-17e839d691a8',\n",
479
+ " 'dc51dd9c-2467-e0dd-c17a-4f3947770146',\n",
480
+ " '6f1523ed-c6b0-62ba-3261-05f993373adb',\n",
481
+ " '97bad942-3a69-447f-d384-9b9a60f9cf88',\n",
482
+ " '1826ed11-0cff-7ab0-4137-4c17ddd9e7fc',\n",
483
+ " '3f71153e-d378-59d1-03d8-7f1bbe15e4c0',\n",
484
+ " 'c17ea483-30a4-014b-c42f-7c6c44b7b47a',\n",
485
+ " '986c2383-4509-0f92-3834-aeea851a216d',\n",
486
+ " '4164df32-97b3-c1f3-ae38-56008f47c435',\n",
487
+ " '3f0d297f-f62a-a8c0-8d8d-b226788f3a40',\n",
488
+ " '7c4bee9b-93ad-26bb-e49d-770e03276add',\n",
489
+ " '7c211878-b398-83a5-90ce-c7839e7d88d1',\n",
490
+ " '2ccb136c-496b-9e5b-a388-57c1c018e5cb',\n",
491
+ " 'b07a6e2e-05e6-550f-a2db-ade353284be5',\n",
492
+ " '44c41257-7a12-83da-8f44-d7e9b1968d45',\n",
493
+ " '638ab06e-5ac1-134e-ded3-af6536a2b04d',\n",
494
+ " 'a2fb7256-e90a-169f-1cc3-7932b73f0cba',\n",
495
+ " '12ee5cfe-be76-be09-a486-ca4252f5f7cc',\n",
496
+ " '2b38415a-1f29-8cda-8625-7d0b0a1c8c26',\n",
497
+ " 'a377526c-aee9-a842-a990-7f2ccbc7a644',\n",
498
+ " '686ad547-a6ba-8187-22c9-5c312575713a',\n",
499
+ " 'ebebc277-7ba6-7b8b-0368-efee03ccc2d7',\n",
500
+ " '2d3b4ed2-70ec-4118-c800-b6f7a48f7b81',\n",
501
+ " 'b905ba7d-7497-ec41-729b-4b343c98db2c',\n",
502
+ " '299f6d65-39b1-3af4-0bf2-f7fee062f6e2',\n",
503
+ " 'fd62bfd1-9a06-40ee-1ede-0590e9de85dc']"
504
+ ]
505
+ },
506
+ "execution_count": 83,
507
+ "metadata": {},
508
+ "output_type": "execute_result"
509
+ }
510
+ ],
511
+ "source": [
512
+ "vectorstore.add_documents(\n",
513
+ " documents=enriched_docs,\n",
514
+ " ids=[get_document_hash_as_uuid(doc) for doc in docs],\n",
515
+ ")"
516
+ ]
517
+ },
518
+ {
519
+ "cell_type": "code",
520
+ "execution_count": 84,
521
+ "metadata": {},
522
+ "outputs": [],
523
+ "source": [
524
+ "result = vectorstore.similarity_search(\"What is the capital of France?\")"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": 87,
530
+ "metadata": {},
531
+ "outputs": [
532
+ {
533
+ "data": {
534
+ "text/plain": [
535
+ "Document(metadata={'source': 'static/data/langchain_rag_tutorial.html', 'title': 'LangChain RAG Tutorial', 'type': 'tutorial', 'source_url': 'https://python.langchain.com/docs/tutorials/rag/', 'description': 'Official LangChain tutorial on building RAG applications', '_id': '2d3b4ed2-70ec-4118-c800-b6f7a48f7b81', '_collection_name': 'problems_reference_collection'}, page_content='code writing mode with a different system message.\\\\nSystem message:\\'), Document(id=\\'1fcc2736-30f4-4ef6-90f2-c64af92118cb\\', metadata={\\'source\\': \\'https://lilianweng.github.io/posts/2023-06-23-agent/\\', \\'start_index\\': 35127, \\'section\\': \\'end\\'}, page_content=\\'\"content\": \"You will get instructions for code to write.\\\\\\\\nYou will write a very long answer. Make sure that every detail of the architecture is, in the end, implemented as code.\\\\\\\\nMake sure that every detail of the architecture is, in the end, implemented as code.\\\\\\\\n\\\\\\\\nThink step by step and reason yourself to the right decisions to make sure we get it right.\\\\\\\\nYou will first lay out the names of the core classes, functions, methods that will be necessary, as well as a quick comment on their purpose.\\\\\\\\n\\\\\\\\nThen you will output the content of each file including ALL code.\\\\\\\\nEach file must strictly follow a markdown code block format, where the following tokens must be replaced such that\\\\\\\\nFILENAME is the lowercase file name including')"
536
+ ]
537
+ },
538
+ "execution_count": 87,
539
+ "metadata": {},
540
+ "output_type": "execute_result"
541
+ }
542
+ ],
543
+ "source": [
544
+ "result[0]"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": null,
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": [
553
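+ "# Check whether PROBLEMS_REFERENCE_COLLECTION_NAME exists and create it if it does not. NOTE(review): get_collection may raise for a missing collection instead of returning None -- confirm.\n",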
554
+ "def check_collection_exists(collection_name):\n",
555
+ " return client.get_collection(collection_name) is not None\n",
556
+ "\n",
557
+ "if not check_collection_exists(PROBLEMS_REFERENCE_COLLECTION_NAME):\n",
558
+ " client.create_collection(\n",
559
+ " PROBLEMS_REFERENCE_COLLECTION_NAME,\n",
560
+ " vectors_config=VectorParams(size=1536, distance=Distance.COSINE),\n",
561
+ " )\n"
562
+ ]
563
+ }
564
+ ],
565
+ "metadata": {
566
+ "kernelspec": {
567
+ "display_name": ".venv",
568
+ "language": "python",
569
+ "name": "python3"
570
+ },
571
+ "language_info": {
572
+ "codemirror_mode": {
573
+ "name": "ipython",
574
+ "version": 3
575
+ },
576
+ "file_extension": ".py",
577
+ "mimetype": "text/x-python",
578
+ "name": "python",
579
+ "nbconvert_exporter": "python",
580
+ "pygments_lexer": "ipython3",
581
+ "version": "3.12.0"
582
+ }
583
+ },
584
+ "nbformat": 4,
585
+ "nbformat_minor": 2
586
+ }