# SimpliFi / backend/app/vectorstore.py
"""
Super early version of a vector store. Just want to make something available for the rest of the app to use.
Vector store implementation with singleton pattern to ensure only one instance exists.
"""
import os
import requests
import nltk
from typing import Optional
from langchain_community.vectorstores import Qdrant
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
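
# NLTK data used when parsing the downloaded HTML (assumed requirement of the
# unstructured-based DirectoryLoader; these downloads are no-ops if already present).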
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Global variable to store the singleton instance
_vector_db_instance: Optional[Qdrant] = None


def get_vector_db() -> Qdrant:
"""
Factory function that returns a singleton instance of the vector database.
Creates the instance if it doesn't exist.
"""
global _vector_db_instance
if _vector_db_instance is None:
# Create static/data directory if it doesn't exist
os.makedirs("static/data", exist_ok=True)
# Download and save the webpage if it doesn't exist
html_path = "static/data/langchain_rag_tutorial.html"
if not os.path.exists(html_path):
url = "https://python.langchain.com/docs/tutorials/rag/"
response = requests.get(url)
with open(html_path, "w", encoding="utf-8") as f:
f.write(response.text)
# Initialize embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
# Load HTML files from static/data directory
loader = DirectoryLoader("static/data", glob="*.html")
documents = loader.load()
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200
)
split_chunks = text_splitter.split_documents(documents)
# Create vector store instance
_vector_db_instance = Qdrant.from_documents(
split_chunks,
embedding_model,
location=":memory:",
collection_name="extending_context_window_llama_3",
)
return _vector_db_instance
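

# Minimal usage sketch (illustrative only, not part of the module's public surface):
# the singleton is built lazily on first call and can be queried with LangChain's
# standard similarity_search API. Assumes OPENAI_API_KEY is set for the embeddings.
if __name__ == "__main__":
    db = get_vector_db()
    # Example query against the indexed RAG tutorial page; the query text is arbitrary.
    for doc in db.similarity_search("What is retrieval-augmented generation?", k=2):
        print(doc.page_content[:200])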