|
"""Ingest PDFs into a Chroma vector store: load documents with PyMuPDF,
split them into chunks, embed them with a Sentence-Transformers model, and
persist the result for retrieval."""

import json
import os
import uuid

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from config import save_config
from log_utils import setup_logging
|
|
|
CONFIG_FILE = 'config.json'

# Pull environment variables (e.g. PERSIST_DIRECTORY) from a local .env file.
load_dotenv()

logger = setup_logging('upload_pdf')
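# Illustrative configuration (placeholder values, not from this repo):
# main() reads data_path and collection_name from config.json and the Chroma
# directory from the PERSIST_DIRECTORY environment variable, e.g.
#
#   config.json:  {"data_path": "data", "collection_name": "pdf_collection"}
#   .env:         PERSIST_DIRECTORY=./chroma_db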
|
|
|
def load_documents(data_path):
    """Load PDF documents from the specified directory."""
    logger.info(f"Starting document loading from directory: {data_path}")

    if not os.path.exists(data_path):
        logger.error(f"Directory not found: {data_path}")
        raise FileNotFoundError(f"Directory not found: {data_path}")

    # Match only PDFs so stray files in the directory are never handed to
    # PyMuPDFLoader, which would fail on non-PDF input.
    directory_loader = DirectoryLoader(
        data_path,
        glob='**/*.pdf',
        loader_cls=PyMuPDFLoader,
        show_progress=True
    )

    try:
        documents = directory_loader.load()
        logger.info(f"Successfully loaded {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error loading documents: {str(e)}", exc_info=True)
        raise
|
|
|
def store_full_content(documents):
    """Store full page content in document metadata."""
    logger.info("Starting to store full page content in metadata")
    try:
        # Copy each page's text into its metadata: chunks split from the page
        # inherit that metadata, so every chunk keeps its full page alongside
        # the snippet.
        for doc in documents:
            doc.metadata['full_page_content'] = doc.page_content
            logger.debug(f"Stored full content for page {doc.metadata.get('page', 'Unknown')} "
                         f"from {os.path.basename(doc.metadata.get('file_path', 'Unknown'))}")
        logger.info(f"Successfully stored full content for {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error storing full content: {str(e)}", exc_info=True)
        raise
|
|
|
def process_documents(documents):
    """Process documents into chunks and add metadata."""
    logger.info("Starting document processing")

    try:
        documents = store_full_content(documents)

        logger.info("Converting documents to chunks")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=384, chunk_overlap=20)
        chunks = text_splitter.split_documents(documents)

        for chunk in chunks:
            chunk.metadata['chunk_id'] = str(uuid.uuid4())
            # Chunks inherit 'full_page_content' from their parent page;
            # fall back to the chunk's own text if it is ever missing.
            chunk.metadata.setdefault('full_page_content', chunk.page_content)

        logger.info(f"Document processing completed. Total chunks created: {len(chunks)}")
        return chunks
    except Exception as e:
        logger.error(f"Error processing documents: {str(e)}", exc_info=True)
        raise
|
|
|
def initialize_embedding_model():
    """Initialize and return the embedding model."""
    logger.info("Initializing embedding model")
    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name='all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'},
            # Unit-length vectors make dot product equivalent to cosine similarity.
            encode_kwargs={'normalize_embeddings': True}
        )
        logger.info("Embedding model initialized successfully")
        return embedding_model
    except Exception as e:
        logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
        raise
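# Illustrative sanity check (not part of the pipeline): with
# normalize_embeddings=True the vectors come back unit length, so dot
# products equal cosine similarity. The query string is a placeholder.
#
#   vec = initialize_embedding_model().embed_query("hello world")
#   assert abs(sum(x * x for x in vec) - 1.0) < 1e-6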
|
|
|
def create_vectordb(chunks, embedding_model, persist_directory, collection_name):
    """Create and persist ChromaDB instance."""
    logger.info(f"Creating Chroma instance with collection name: {collection_name}")
    try:
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name
        )
        vectordb.persist()
        logger.info("Vector database created and persisted successfully")
        return vectordb
    except Exception as e:
        logger.error(f"Error creating vector database: {str(e)}", exc_info=True)
        raise
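# A minimal retrieval sketch against the persisted store, assuming the same
# persist_directory and collection_name used at ingest time. The query text
# and k are placeholders; 'full_page_content' is the metadata key set above.
def example_similarity_query(persist_directory, collection_name, query, k=3):
    """Illustrative only: reopen the persisted store and print the top-k hits."""
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=initialize_embedding_model(),
        collection_name=collection_name
    )
    for doc in vectordb.similarity_search(query, k=k):
        source = os.path.basename(doc.metadata.get('file_path', 'Unknown'))
        # Each hit carries the complete page text stored by store_full_content().
        print(f"{source} (page {doc.metadata.get('page', '?')}): "
              f"{doc.metadata['full_page_content'][:200]}")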
|
|
|
def update_or_add_pdf(uploaded_file, data_path, persist_directory, collection_name):
    """Add or replace a PDF in the system."""
    logger.info(f"Processing uploaded file: {uploaded_file.name}")

    if not uploaded_file.name.lower().endswith('.pdf'):
        logger.warning(f"Rejected non-PDF file: {uploaded_file.name}")
        return False

    file_path = os.path.join(data_path, uploaded_file.name)

    try:
        # Replace any previous copy of the file on disk.
        if os.path.exists(file_path):
            os.remove(file_path)
            logger.info(f"Deleted existing PDF: {uploaded_file.name}")

        with open(file_path, 'wb') as f:
            f.write(uploaded_file.getvalue())
        logger.info(f"Saved new PDF: {uploaded_file.name}")

        # Reload the directory and keep only the pages of the uploaded file.
        documents = load_documents(data_path)
        new_documents = [doc for doc in documents
                         if os.path.basename(doc.metadata.get('file_path', '')) == uploaded_file.name]

        if not new_documents:
            logger.error(f"No documents found for uploaded file: {uploaded_file.name}")
            return False

        chunks = process_documents(new_documents)
        embedding_model = initialize_embedding_model()

        vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
            collection_name=collection_name
        )

        # Drop any vectors previously indexed for this file before re-adding it.
        existing_docs = vectordb.get(where={"source": file_path})
        if existing_docs['ids']:
            vectordb.delete(ids=existing_docs['ids'])
            logger.info(f"Removed existing vectors for {uploaded_file.name}")

        vectordb.add_documents(documents=chunks)
        vectordb.persist()
        logger.info(f"Successfully updated {uploaded_file.name} in vector database")

        return True
    except Exception as e:
        logger.error(f"Error processing uploaded PDF {uploaded_file.name}: {str(e)}", exc_info=True)
        return False
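# Usage sketch (an assumption about the caller, not part of this module):
# update_or_add_pdf() only needs an object exposing `.name` and `.getvalue()`,
# which matches Streamlit's UploadedFile, e.g.:
#
#   uploaded = st.file_uploader("Upload a PDF", type="pdf")
#   if uploaded is not None:
#       update_or_add_pdf(uploaded, data_path, persist_directory, collection_name)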
|
|
|
def main():
    logger.info("Starting PDF processing pipeline")
    try:
        with open(CONFIG_FILE, 'r') as f:
            config = json.load(f)

        data_path = config.get('data_path')
        persist_directory = os.environ.get('PERSIST_DIRECTORY')
        collection_name = config.get('collection_name')

        logger.info(f"Using configuration - data_path: {data_path}, "
                    f"persist_directory: {persist_directory}, "
                    f"collection_name: {collection_name}")

        save_config(data_path, persist_directory, collection_name)
        logger.info("Configuration saved successfully")

        documents = load_documents(data_path)
        chunks = process_documents(documents)
        embedding_model = initialize_embedding_model()
        create_vectordb(chunks, embedding_model, persist_directory, collection_name)

        logger.info("PDF processing pipeline completed successfully!")

    except Exception:
        logger.error("Fatal error in PDF processing pipeline", exc_info=True)
        raise


if __name__ == "__main__":
    main()