In [1]:
import os
import requests
import nltk
import logging
import uuid

from typing import Optional, List
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from langchain.schema import Document

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PROBLEMS_REFERENCE_COLLECTION_NAME = "problems_reference_collection"

In [3]:
_qdrant_client_instance: Optional[QdrantClient] = None

def get_qdrant_client():
    global _qdrant_client_instance

    if _qdrant_client_instance is None:
        QDRANT_URL = "https://f920e9b6-c14c-40e4-9fbe-a2aabf26e2b5.us-east-1-0.aws.cloud.qdrant.io"
        QDRANT_API_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.lWz54tW8xpFc85mqDRgmj_luvKbEcJhK6hkLVNMEKsk"

        _qdrant_client_instance = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
    return _qdrant_client_instance

In [4]:
DEFAULT_EMBEDDING_MODEL_ID = "text-embedding-3-small"
embedding_model = OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)

client = get_qdrant_client()

In [5]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='problems_reference_collection'), CollectionDescription(name='star_charts')])

In [6]:
collection_info = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)

In [88]:
client.get_collections()

CollectionsResponse(collections=[])

In [7]:
collection_info.vectors_count

In [8]:
def store_documents(
    source: str, documents: List[Document], collection_name: str, client: QdrantClient
):
    client.add(
        collection_name=collection_name,
        documents=documents,
        ids=[str(uuid.uuid4()) for _ in documents],
        payload={"source": source},
    )

def get_docs(embedding_model):
        # Create static/data directory if it doesn't exist
        os.makedirs("static/data", exist_ok=True)

        # Download and save the webpage if it doesn't exist
        html_path = "static/data/langchain_rag_tutorial.html"
        if not os.path.exists(html_path):
            url = "https://python.langchain.com/docs/tutorials/rag/"
            response = requests.get(url)
            with open(html_path, "w", encoding="utf-8") as f:
                f.write(response.text)

        # Load HTML files from static/data directory
        loader = DirectoryLoader("static/data", glob="*.html")
        documents = loader.load()

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )
        split_chunks = text_splitter.split_documents(documents)

        return split_chunks

In [9]:
docs = get_docs(embedding_model)

In [10]:
collection_info.config.params.vectors.size

1536

In [79]:
client.delete_collection("test_collection")
client.delete_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)

True

In [80]:
from qdrant_client.models import VectorParams, Distance
client.create_collection(
    PROBLEMS_REFERENCE_COLLECTION_NAME,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

True

In [81]:
vectorstore = Qdrant(
    client=client,
    collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
    embeddings=embedding_model
)

In [70]:
import hashlib
import uuid

def get_document_hash_as_uuid(doc):
    # First get the hash of the content
    content_hash = hashlib.sha256(doc.page_content.encode()).hexdigest()
    
    # Convert the first 32 characters of the hash (16 bytes) to UUID
    # UUID requires exactly 16 bytes (32 hex characters)
    uuid_from_hash = uuid.UUID(content_hash[:32])
    
    return str(uuid_from_hash)

In [74]:
vectorstore.add_documents(
    documents=docs,
    ids=[get_document_hash_as_uuid(doc) for doc in docs],
)

['794f95e2-bee6-e5cc-ed64-7c6fe1aef022',
 '6fa4f018-af75-fd5c-a90e-d460b30972ef',
 'ad483089-34a0-5f96-0588-5e288b5964b5',
 'b3e2ac2a-35e0-58b3-d5f5-d98929d6caab',
 '4cf742c8-601a-65f1-cfd6-79876b068503',
 'c74bc126-5e9f-d70c-c0a0-3ec91ea248d0',
 '6366496e-5133-00f3-36d5-cdd91b479aa5',
 'c9e530f6-b567-ffc5-cd44-781022dfcfc6',
 '512f428b-05a7-920b-c2a9-1211406bb7ed',
 '8a092ec4-c4fd-c234-2b7c-bb2e23cbe973',
 '54813989-564e-3b6c-3ef8-451f33cdbf6b',
 'e0611fe9-cade-2e43-6966-82d7a26c0278',
 '1eebf00f-a10a-0d73-982e-cd8844945c18',
 '02002419-ec33-775d-2b85-bc53e12aa3cf',
 '62a197cd-0e46-e846-b7dc-fbd0dc210a31',
 'aa1618aa-b1b1-3b19-e356-81b8b21affd4',
 'db4474e5-7265-f6e5-e242-bca78d1503a1',
 'bebdc4ad-f0a3-6480-5c82-dc8f0ace870b',
 '6dc203ca-380d-a452-84cd-3ee0abdd47b5',
 'fe66ef26-24a3-199c-ba07-3a068a4b1c75',
 '6cb951d3-12c4-0614-a07e-4ac3c4b9b52f',
 'f98f92b9-6d1f-226a-eed7-656edc04db79',
 'ccfef227-20e2-bf29-e740-f66f5e376b72',
 'e53a74e8-118d-2d42-78ed-d6ea3ad93201',
 '9772a884-e0b8-

In [47]:
docs[0]

Document(metadata={'source': 'static/data/langchain_rag_tutorial.html'}, page_content='Tutorials\n\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\n\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\n\nOne of the most powerful applications enabled by LLMs is sophisticated question-answering (Q&A) chatbots. These are applications that can answer questions about specific source information. These applications use a technique known as Retrieval Augmented Generation, or RAG.\n\nThis is a multi-part tutorial:\n\nPart 1 (this guide) introduces RAG and walks through a minimal implementation.\n\nPart 2 extends the implementation to accommodate conversation-style interactions and multi-step retrieval processes.')

In [58]:
problem_reference_collection = client.get_collection(PROBLEMS_REFERENCE_COLLECTION_NAME)
problem_reference_collection


CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=100, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=None), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema=

In [57]:
result = vectorstore.similarity_search("What is the capital of France?")

In [53]:
result[0].metadata

{'source': 'static/data/langchain_rag_tutorial.html',
 '_id': '7072fce1-91f3-43f8-bd1c-2a2efebf258c',
 '_collection_name': 'problems_reference_collection'}

In [82]:
def enrich_document_metadata(doc: Document, **additional_metadata) -> Document:
    """Add additional metadata to a document while preserving original metadata."""
    doc.metadata.update(additional_metadata)
    return doc

enriched_docs = [
            enrich_document_metadata(
                doc,
                title="LangChain RAG Tutorial",
                # type="tutorial",
                source_url="https://python.langchain.com/docs/tutorials/rag/",
                description="Official LangChain tutorial on building RAG applications",
            ) for doc in docs
        ]

In [78]:
enriched_docs[0]

Document(metadata={'source': 'static/data/langchain_rag_tutorial.html', 'title': 'LangChain RAG Tutorial', 'type': 'tutorial', 'source_url': 'https://python.langchain.com/docs/tutorials/rag/', 'description': 'Official LangChain tutorial on building RAG applications'}, page_content='Tutorials\n\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\n\nBuild a Retrieval Augmented Generation (RAG) App: Part 1\n\nOne of the most powerful applications enabled by LLMs is sophisticated question-answering (Q&A) chatbots. These are applications that can answer questions about specific source information. These applications use a technique known as Retrieval Augmented Generation, or RAG.\n\nThis is a multi-part tutorial:\n\nPart 1 (this guide) introduces RAG and walks through a minimal implementation.\n\nPart 2 extends the implementation to accommodate conversation-style interactions and multi-step retrieval processes.')

In [83]:
vectorstore.add_documents(
    documents=enriched_docs,
    ids=[get_document_hash_as_uuid(doc) for doc in docs],
)

['794f95e2-bee6-e5cc-ed64-7c6fe1aef022',
 '6fa4f018-af75-fd5c-a90e-d460b30972ef',
 'ad483089-34a0-5f96-0588-5e288b5964b5',
 'b3e2ac2a-35e0-58b3-d5f5-d98929d6caab',
 '4cf742c8-601a-65f1-cfd6-79876b068503',
 'c74bc126-5e9f-d70c-c0a0-3ec91ea248d0',
 '6366496e-5133-00f3-36d5-cdd91b479aa5',
 'c9e530f6-b567-ffc5-cd44-781022dfcfc6',
 '512f428b-05a7-920b-c2a9-1211406bb7ed',
 '8a092ec4-c4fd-c234-2b7c-bb2e23cbe973',
 '54813989-564e-3b6c-3ef8-451f33cdbf6b',
 'e0611fe9-cade-2e43-6966-82d7a26c0278',
 '1eebf00f-a10a-0d73-982e-cd8844945c18',
 '02002419-ec33-775d-2b85-bc53e12aa3cf',
 '62a197cd-0e46-e846-b7dc-fbd0dc210a31',
 'aa1618aa-b1b1-3b19-e356-81b8b21affd4',
 'db4474e5-7265-f6e5-e242-bca78d1503a1',
 'bebdc4ad-f0a3-6480-5c82-dc8f0ace870b',
 '6dc203ca-380d-a452-84cd-3ee0abdd47b5',
 'fe66ef26-24a3-199c-ba07-3a068a4b1c75',
 '6cb951d3-12c4-0614-a07e-4ac3c4b9b52f',
 'f98f92b9-6d1f-226a-eed7-656edc04db79',
 'ccfef227-20e2-bf29-e740-f66f5e376b72',
 'e53a74e8-118d-2d42-78ed-d6ea3ad93201',
 '9772a884-e0b8-

In [84]:
result = vectorstore.similarity_search("What is the capital of France?")

In [87]:
result[0]

Document(metadata={'source': 'static/data/langchain_rag_tutorial.html', 'title': 'LangChain RAG Tutorial', 'type': 'tutorial', 'source_url': 'https://python.langchain.com/docs/tutorials/rag/', 'description': 'Official LangChain tutorial on building RAG applications', '_id': '2d3b4ed2-70ec-4118-c800-b6f7a48f7b81', '_collection_name': 'problems_reference_collection'}, page_content='code writing mode with a different system message.\\nSystem message:\'), Document(id=\'1fcc2736-30f4-4ef6-90f2-c64af92118cb\', metadata={\'source\': \'https://lilianweng.github.io/posts/2023-06-23-agent/\', \'start_index\': 35127, \'section\': \'end\'}, page_content=\'"content": "You will get instructions for code to write.\\\\nYou will write a very long answer. Make sure that every detail of the architecture is, in the end, implemented as code.\\\\nMake sure that every detail of the architecture is, in the end, implemented as code.\\\\n\\\\nThink step by step and reason yourself to the right decisions to make

In [None]:
# function to check if PROBLEMS_REFERENCE_COLLECTION_NAME exists. If not, create it.
def check_collection_exists(collection_name):
    return client.get_collection(collection_name) is not None

if not check_collection_exists(PROBLEMS_REFERENCE_COLLECTION_NAME):
    client.create_collection(
        PROBLEMS_REFERENCE_COLLECTION_NAME,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )
