Spaces:

VishnuRamDebyez
/

FastAPI

Sleeping

App Files Files Community

FastAPI / chroma_utils.py

VishnuRamDebyez

Update chroma_utils.py

1f694ab verified 9 months ago

raw

history blame contribute delete

2.16 kB

	from langchain_google_genai import ChatGoogleGenerativeAI
	from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredHTMLLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_google_genai import GoogleGenerativeAIEmbeddings
	from langchain_chroma import Chroma
	from typing import List
	from langchain_core.documents import Document
	import os



	# os.environ only accepts str values. os.getenv returns None when the
	# variable is unset, so an unguarded assignment would raise TypeError at
	# import time. Re-export the key only when it is actually present.
	_google_api_key = os.getenv("GOOGLE_API_KEY")
	if _google_api_key is not None:
	    os.environ["GOOGLE_API_KEY"] = _google_api_key

	# Shared splitter: chunks of up to 1000 characters (length measured with
	# len) with a 200-character overlap between consecutive chunks.
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
	# NOTE(review): presumably the embeddings client reads GOOGLE_API_KEY from
	# the environment and will report a missing key itself - confirm.
	embedding_function = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
	# Module-wide Chroma store backed by the ./chroma_db directory.
	vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

	def load_and_split_document(file_path: str) -> List[Document]:
	    """Load a PDF, DOCX, or HTML file and split it into chunks.

	    The loader is picked from the file extension. The extension is
	    compared case-insensitively, so names such as ``REPORT.PDF`` are
	    accepted as well as ``report.pdf``.

	    Args:
	        file_path: Path of the document to load.

	    Returns:
	        The chunks produced by the module-level ``text_splitter`` from the
	        loaded documents.

	    Raises:
	        ValueError: If the path does not end in ``.pdf``, ``.docx`` or
	            ``.html``.
	    """
	    # Match on a lower-cased copy only; the loader still receives the
	    # caller's original path so the file is found on case-sensitive
	    # filesystems.
	    normalized_path = file_path.lower()
	    if normalized_path.endswith('.pdf'):
	        loader = PyPDFLoader(file_path)
	    elif normalized_path.endswith('.docx'):
	        loader = Docx2txtLoader(file_path)
	    elif normalized_path.endswith('.html'):
	        loader = UnstructuredHTMLLoader(file_path)
	    else:
	        raise ValueError(f"Unsupported file type: {file_path}")

	    documents = loader.load()
	    return text_splitter.split_documents(documents)

	def index_document_to_chroma(file_path: str, file_id: int) -> bool:
	    """Split a document and add its chunks to the shared vector store.

	    Each chunk's metadata gets a ``file_id`` entry before it is stored, so
	    the chunks can be looked up by that id later.

	    Args:
	        file_path: Path of the document to index.
	        file_id: Identifier written into every chunk's metadata.

	    Returns:
	        True when loading, tagging and storing all succeed; False when any
	        step raises (the error is printed, not re-raised).
	    """
	    try:
	        chunks = load_and_split_document(file_path)

	        # Tag every chunk with the id of the file it came from.
	        for chunk in chunks:
	            chunk.metadata['file_id'] = file_id

	        # NOTE(review): an explicit vectorstore.persist() call was
	        # previously commented out here - presumably persistence is handled
	        # by the store itself; confirm.
	        vectorstore.add_documents(chunks)
	    except Exception as e:
	        print(f"Error indexing document: {e}")
	        return False
	    else:
	        return True

	def delete_doc_from_chroma(file_id: int):
	    """Delete every stored chunk whose metadata ``file_id`` matches.

	    The number of matching chunks is printed first, then the delete is
	    issued with the same metadata filter.

	    Args:
	        file_id: Identifier of the file whose chunks should be removed.

	    Returns:
	        True when the lookup and delete complete without raising; False
	        when any step raises (the error is printed, not re-raised).
	    """
	    try:
	        # Lookup is only used to report how many chunks matched.
	        matches = vectorstore.get(where={"file_id": file_id})
	        chunk_count = len(matches['ids'])
	        print(f"Found {chunk_count} document chunks for file_id {file_id}")

	        # NOTE(review): this reaches into the private _collection attribute
	        # of the store - check whether a public delete API can take the
	        # same filter.
	        collection = vectorstore._collection
	        collection.delete(where={"file_id": file_id})
	        print(f"Deleted all documents with file_id {file_id}")
	    except Exception as e:
	        print(f"Error deleting document with file_id {file_id} from Chroma: {str(e)}")
	        return False
	    else:
	        return True