File size: 8,640 Bytes
ed91833 1ef298a ed91833 999f24c ed91833 999f24c 654e910 45884d3 654e910 ed91833 1ef298a 4d17f84 999f24c 45884d3 ed91833 999f24c ed91833 4d17f84 45884d3 999f24c 4d17f84 1ef298a 45884d3 654e910 4d17f84 1ef298a 999f24c 654e910 45884d3 b22f9a0 45884d3 b22f9a0 45884d3 b22f9a0 45884d3 b22f9a0 7d1e4e7 b22f9a0 7d1e4e7 b22f9a0 45884d3 654e910 45884d3 654e910 999f24c 654e910 b22f9a0 999f24c 654e910 45884d3 999f24c 895b645 654e910 1ef298a 999f24c 1ef298a 654e910 1ef298a b22f9a0 1ef298a 654e910 45884d3 654e910 1ef298a 654e910 1ef298a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 |
"""
Super early version of a vector store. Just want to make something available for the rest of the app to use.
Vector store implementation with singleton pattern to ensure only one instance exists.
"""
import os
import requests
import nltk
import logging
import requests
from typing import Optional, List, Union
from langchain_qdrant import QdrantVectorStore
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from langchain.schema import Document
from .vectorstore_helpers import (
get_document_hash_as_uuid,
enrich_document_metadata,
check_collection_exists,
)
# NLTK resources are fetched as an import-time side effect of this module.
# NOTE(review): presumably required by the HTML loading done in
# _initialize_vector_db — confirm before removing.
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")
# Embedding model used when callers do not request a specific model id.
DEFAULT_EMBEDDING_MODEL_ID = "text-embedding-3-small"
# Vector size and distance metric used when the reference collection is created.
# NOTE(review): 1536 presumably matches the default embedding model's output
# size — confirm if the default model is changed.
DEFAULT_VECTOR_DIMENSIONS = 1536
DEFAULT_VECTOR_DISTANCE = Distance.COSINE
# Name of the Qdrant collection this module creates, seeds and queries.
PROBLEMS_REFERENCE_COLLECTION_NAME = "problems_reference_collection"
# Directory used for Qdrant's local on-disk storage.
LOCAL_QDRANT_PATH = "/data/qdrant_db"
logger = logging.getLogger(__name__)
# Module-level singletons, lazily populated by the get_* factory functions below.
_qdrant_client_instance: Optional[QdrantClient] = None
_vector_db_instance: Optional[QdrantVectorStore] = None
_embedding_model: Optional[Union[OpenAIEmbeddings, HuggingFaceEmbeddings]] = None
# Model id that `_embedding_model` was built for (None means the default model).
_embedding_model_id: Optional[str] = None
def _initialize_vector_db():
    """Seed the reference collection with the LangChain RAG tutorial.

    Downloads the tutorial page into ``static/data`` unless a cached copy is
    already there, loads every ``*.html`` file found in that directory,
    attaches the tutorial metadata to each loaded document, splits the
    documents into overlapping chunks and hands them to ``store_documents``.

    Raises:
        requests.RequestException: If the tutorial page cannot be downloaded
            (connection error, timeout, or a non-success HTTP status).
    """
    data_dir = "static/data"
    html_path = "static/data/langchain_rag_tutorial.html"
    tutorial_url = "https://python.langchain.com/docs/tutorials/rag/"

    os.makedirs(data_dir, exist_ok=True)
    if not os.path.exists(html_path):
        # A timeout keeps start-up from hanging forever on a stalled connection.
        response = requests.get(tutorial_url, timeout=30)
        # Fail loudly instead of caching an HTTP error page on disk, which
        # would otherwise be indexed as if it were the tutorial.
        response.raise_for_status()
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(response.text)

    loader = DirectoryLoader(data_dir, glob="*.html")
    documents = loader.load()

    # Every HTML file in the directory receives the same tutorial metadata.
    enriched_docs = [
        enrich_document_metadata(
            doc,
            title="LangChain RAG Tutorial",
            type="tutorial",
            source_url=tutorial_url,
            description="Official LangChain tutorial on building RAG applications",
            date_added="2024-03-21",
            category="documentation",
            version="1.0",
            language="en",
        )
        for doc in documents
    ]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_chunks = text_splitter.split_documents(enriched_docs)
    store_documents(
        split_chunks,
        PROBLEMS_REFERENCE_COLLECTION_NAME,
    )
def get_qdrant_client():
    """Return the shared Qdrant client, creating it on first use.

    When both ``QDRANT_URL`` and ``QDRANT_API_KEY`` are set in the
    environment, a client for that server is created. Otherwise a client
    backed by on-disk storage at ``LOCAL_QDRANT_PATH`` is created. The
    created client is cached in a module-level global and returned on every
    later call.

    Returns:
        The cached ``QdrantClient`` instance.

    Raises:
        Exception: Whatever ``QdrantClient`` raises while connecting to the
            remote server; the error is logged and re-raised unchanged.
    """
    global _qdrant_client_instance
    if _qdrant_client_instance is not None:
        return _qdrant_client_instance

    # Read the configuration once so the check, the log line and the
    # connection all use the same values.
    qdrant_url = os.environ.get("QDRANT_URL")
    qdrant_api_key = os.environ.get("QDRANT_API_KEY")

    if qdrant_url is None or qdrant_api_key is None:
        logger.warning(
            "QDRANT_URL or QDRANT_API_KEY is not set. "
            "Defaulting to local on-disk vector store at %s.",
            LOCAL_QDRANT_PATH,
        )
        os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
        _qdrant_client_instance = QdrantClient(path=LOCAL_QDRANT_PATH)
        return _qdrant_client_instance

    # Lazy %-formatting also avoids nesting double quotes inside an f-string,
    # which is a syntax error on Python versions before 3.12.
    logger.info("Attempting to connect to Qdrant at %s", qdrant_url)
    try:
        _qdrant_client_instance = QdrantClient(
            url=qdrant_url,
            api_key=qdrant_api_key,
        )
    except Exception as e:
        logger.error("Failed to connect to Qdrant Cloud: %s", e)
        raise
    else:
        logger.info("Successfully connected to Qdrant Cloud")
    return _qdrant_client_instance
def get_all_unique_source_of_docs_in_collection(
    collection_name: str = PROBLEMS_REFERENCE_COLLECTION_NAME,
    limit: int = 1000,
    offset: int = 0,
) -> List[str]:
    """Collect the distinct ``source`` payload values stored in a collection.

    Scrolls through the collection page by page using the Qdrant client and
    gathers the ``source`` field of every point that has one.

    NOTE(review): points written through ``QdrantVectorStore`` may keep their
    metadata nested under another payload key — confirm that ``source`` is
    really present at the top level of the payload.

    Args:
        collection_name: Collection to scan.
        limit: Maximum number of points fetched per scroll request.
        offset: Scroll position passed to the first request.

    Returns:
        The unique source values, in no particular order.
    """
    client = get_qdrant_client()
    sources = set()
    next_offset = offset

    while True:
        # scroll() returns (points, next_page_offset). The second element is
        # the point id to resume from, so it is passed back unchanged rather
        # than being combined with `limit`.
        points, next_offset = client.scroll(
            collection_name=collection_name,
            limit=limit,
            offset=next_offset,
            with_payload=["source"],
            with_vectors=False,
        )
        for point in points:
            payload = point.payload or {}
            if "source" in payload:
                sources.add(payload["source"])
        # A missing next-page offset (or an empty page) marks the end.
        if next_offset is None or not points:
            break

    return list(sources)
# TODO: Workaround for Qdrant client restrictions when using local file storage.
# Rather than talking to the client directly, this runs a similarity search
# through QdrantVectorStore with a dummy query and extracts the unique values
# from the documents that come back.
def get_all_unique_source_of_docs_in_collection_DUMB(
    collection_name: str = PROBLEMS_REFERENCE_COLLECTION_NAME,
) -> List[str]:
    """Return the distinct ``title`` metadata values found via the vector store.

    The lookup goes through the shared vector store returned by
    ``get_vector_db()``; ``collection_name`` is accepted but not used by the
    body. At most 1000 documents are inspected because the search is issued
    with ``k=1000``.

    Returns:
        The unique titles, in no particular order.
    """
    store = get_vector_db()
    # An empty query string is used as a generic query.
    hits = store.similarity_search("", k=1000)
    unique_titles = {
        hit.metadata["title"]
        for hit in hits
        if hit.metadata and "title" in hit.metadata
    }
    return list(unique_titles)
def store_documents(
    documents: List[Document],
    collection_name: str,
    embedding_model_id: str = None,
):
    """Add documents to the already-initialized shared vector store.

    Each document is stored under the id produced by
    ``get_document_hash_as_uuid``.

    Args:
        documents: Documents (typically pre-split chunks) to store.
        collection_name: Kept for interface compatibility. The documents are
            written through the shared vector store instance, so this value
            does not select the target collection.
        embedding_model_id: Kept for interface compatibility. Embedding is
            performed by the shared vector store instance, so this value is
            not used.

    Raises:
        AssertionError: If the vector store has not been created yet (call
            ``get_vector_db()`` first).
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    # The exception type is kept so existing callers see the same error.
    if _vector_db_instance is None:
        raise AssertionError("Vector database instance not initialized")

    if not documents:
        # Nothing to embed or upload.
        return

    _vector_db_instance.add_documents(
        documents=documents,
        ids=[get_document_hash_as_uuid(doc) for doc in documents],
    )
def get_embedding_model(embedding_model_id: str = None):
    """
    Return the cached embedding model, building it when needed.

    The cached instance is reused as long as the requested id equals the id
    it was built for. Otherwise a new model is created and cached:
    ``OpenAIEmbeddings`` with the default model id when no id is given,
    ``HuggingFaceEmbeddings`` for any explicit id.
    """
    global _embedding_model, _embedding_model_id

    cache_hit = (
        _embedding_model is not None and embedding_model_id == _embedding_model_id
    )
    if cache_hit:
        return _embedding_model

    wants_default = embedding_model_id is None
    _embedding_model = (
        OpenAIEmbeddings(model=DEFAULT_EMBEDDING_MODEL_ID)
        if wants_default
        else HuggingFaceEmbeddings(model_name=embedding_model_id)
    )
    # Remember which id the cached model belongs to.
    _embedding_model_id = embedding_model_id
    return _embedding_model
def get_vector_db(embedding_model_id: str = None) -> QdrantVectorStore:
    """
    Factory function that returns a singleton instance of the vector database.
    Creates the instance if it doesn't exist.

    On first use this ensures the reference collection exists (creating it
    with the default vector size and distance when it is missing), builds the
    ``QdrantVectorStore`` and, if the collection was just created, seeds it
    via ``_initialize_vector_db()``.

    Args:
        embedding_model_id: Embedding model to use when the store is first
            created; ignored once the singleton exists.

    Returns:
        The shared ``QdrantVectorStore`` instance.
    """
    global _vector_db_instance
    if _vector_db_instance is not None:
        return _vector_db_instance

    embedding_model = get_embedding_model(embedding_model_id)
    client = get_qdrant_client()

    need_to_initialize_db = not check_collection_exists(
        client, PROBLEMS_REFERENCE_COLLECTION_NAME
    )
    if need_to_initialize_db:
        client.create_collection(
            PROBLEMS_REFERENCE_COLLECTION_NAME,
            vectors_config=VectorParams(
                size=DEFAULT_VECTOR_DIMENSIONS, distance=DEFAULT_VECTOR_DISTANCE
            ),
        )

    # Same condition get_qdrant_client() uses to choose remote vs. local.
    qdrant_url = os.environ.get("QDRANT_URL")
    qdrant_api_key = os.environ.get("QDRANT_API_KEY")

    if qdrant_url is not None and qdrant_api_key is not None:
        # Remote server: open the store against the same server on which the
        # collection was just checked/created, and leave the shared client
        # open so other helpers can keep using it.
        _vector_db_instance = QdrantVectorStore.from_existing_collection(
            url=qdrant_url,
            api_key=qdrant_api_key,
            collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
            embedding=embedding_model,
        )
    else:
        os.makedirs(LOCAL_QDRANT_PATH, exist_ok=True)
        # TODO temp. Local file storage allows only one client per folder
        # ("Storage folder ... is already accessed by another instance of
        # Qdrant client"), so the shared client is closed before the vector
        # store opens the same path with its own client. A Qdrant server
        # would remove the need for this.
        client.close()
        _vector_db_instance = QdrantVectorStore.from_existing_collection(
            path=LOCAL_QDRANT_PATH,
            collection_name=PROBLEMS_REFERENCE_COLLECTION_NAME,
            embedding=embedding_model,
        )

    # Seeding goes through store_documents(), which needs the singleton set,
    # so this must run after the assignment above.
    if need_to_initialize_db:
        _initialize_vector_db()

    return _vector_db_instance
|