# Smart-PDF-Search / upload_pdf.py
import os
import uuid
import json
import logging
from config import save_config
from dotenv import load_dotenv
from log_utils import setup_logging
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
CONFIG_FILE = 'config.json'
# Load environment variables
load_dotenv()
logger = setup_logging('upload_pdf')
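# Expected configuration (inferred from main() below): config.json supplies
# 'data_path' and 'collection_name', while PERSIST_DIRECTORY comes from the
# environment (e.g. via the .env file loaded above). An illustrative,
# hypothetical config.json:
#     {"data_path": "data", "collection_name": "pdf_collection"}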
def load_documents(data_path):
    """Load PDF documents from the specified directory."""
    logger.info(f"Starting document loading from directory: {data_path}")
    if not os.path.exists(data_path):
        logger.error(f"Directory not found: {data_path}")
        raise FileNotFoundError(f"Directory not found: {data_path}")
    directory_loader = DirectoryLoader(
        data_path,
        glob="**/*.pdf",  # restrict to PDFs so PyMuPDFLoader never receives other file types
        loader_cls=PyMuPDFLoader,
        show_progress=True
    )
    try:
        documents = directory_loader.load()
        logger.info(f"Successfully loaded {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error loading documents: {str(e)}", exc_info=True)
        raise
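# Each loaded Document carries PyMuPDF metadata; the rest of this pipeline
# relies on the 'file_path', 'page', and 'source' keys. A quick check (sketch,
# assuming a 'data' directory containing at least one PDF):
#     docs = load_documents('data')
#     print(docs[0].metadata['file_path'], docs[0].metadata['page'])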
def store_full_content(documents):
    """Store full page content in document metadata."""
    logger.info("Starting to store full page content in metadata")
    try:
        for doc in documents:
            doc.metadata['full_page_content'] = doc.page_content
            logger.debug(f"Stored full content for page {doc.metadata.get('page', 'Unknown')} "
                         f"from {os.path.basename(doc.metadata.get('file_path', 'Unknown'))}")
        logger.info(f"Successfully stored full content for {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error storing full content: {str(e)}", exc_info=True)
        raise
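# Presumably this lets the search UI display the full page a chunk came from;
# the trade-off is that every chunk duplicates its source page's text in
# metadata.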
def process_documents(documents):
    """Process documents into chunks and add metadata."""
    logger.info("Starting document processing")
    try:
        # First store full page content
        documents = store_full_content(documents)
        logger.info("Converting documents to chunks")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=384, chunk_overlap=20)
        chunks = text_splitter.split_documents(documents)
        # Add a UUID per chunk; split_documents copies page metadata onto each
        # chunk, so 'full_page_content' is normally inherited already
        for chunk in chunks:
            chunk.metadata['chunk_id'] = str(uuid.uuid4())
            if 'full_page_content' not in chunk.metadata:
                chunk.metadata['full_page_content'] = chunk.page_content
        logger.info(f"Document processing completed. Total chunks created: {len(chunks)}")
        return chunks
    except Exception as e:
        logger.error(f"Error processing documents: {str(e)}", exc_info=True)
        raise
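# After processing, each chunk is a Document whose metadata holds the loader
# fields plus 'chunk_id' (a fresh UUID) and 'full_page_content' (the text of
# the page it was cut from). Note that chunk_size and chunk_overlap are
# measured in characters, not tokens, for RecursiveCharacterTextSplitter.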
def initialize_embedding_model():
    """Initialize and return the embedding model."""
    logger.info("Initializing embedding model")
    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name='all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        logger.info("Embedding model initialized successfully")
        return embedding_model
    except Exception as e:
        logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
        raise
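# Sanity-check sketch: all-MiniLM-L6-v2 produces 384-dimensional vectors, and
# normalize_embeddings=True makes them unit length, so cosine similarity and
# dot product coincide.
#     vec = initialize_embedding_model().embed_query("hello world")
#     assert len(vec) == 384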
def create_vectordb(chunks, embedding_model, persist_directory, collection_name):
    """Create and persist ChromaDB instance."""
    logger.info(f"Creating Chroma instance with collection name: {collection_name}")
    try:
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name
        )
        vectordb.persist()
        logger.info("Vector database created and persisted successfully")
        return vectordb
    except Exception as e:
        logger.error(f"Error creating vector database: {str(e)}", exc_info=True)
        raise
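# Minimal query sketch (hypothetical helper, not used elsewhere in this file):
# reopen the persisted collection and run a similarity search against it.
def query_vectordb(query, persist_directory, collection_name, k=4):
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=initialize_embedding_model(),
        collection_name=collection_name
    )
    return vectordb.similarity_search(query, k=k)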
def update_or_add_pdf(uploaded_file, data_path, persist_directory, collection_name):
    """Add or replace a PDF in the system."""
    logger.info(f"Processing uploaded file: {uploaded_file.name}")
    if not uploaded_file.name.lower().endswith('.pdf'):
        logger.warning(f"Rejected non-PDF file: {uploaded_file.name}")
        return False
    file_path = os.path.join(data_path, uploaded_file.name)
    try:
        # Remove existing PDF if it exists
        if os.path.exists(file_path):
            os.remove(file_path)
            logger.info(f"Deleted existing PDF: {uploaded_file.name}")
        # Save the uploaded PDF
        with open(file_path, 'wb') as f:
            f.write(uploaded_file.getvalue())
        logger.info(f"Saved new PDF: {uploaded_file.name}")
        # Reload the whole directory, then keep only the uploaded file's pages
        documents = load_documents(data_path)
        new_documents = [
            doc for doc in documents
            if os.path.basename(doc.metadata.get('file_path', '')) == uploaded_file.name
        ]
        if not new_documents:
            logger.error(f"No documents found for uploaded file: {uploaded_file.name}")
            return False
        chunks = process_documents(new_documents)
        embedding_model = initialize_embedding_model()
        # Update vector database
        vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
            collection_name=collection_name
        )
        # Remove existing vectors for this file before re-adding
        existing_docs = vectordb.get(where={"source": file_path})
        if existing_docs['ids']:
            vectordb.delete(ids=existing_docs['ids'])
            logger.info(f"Removed existing vectors for {uploaded_file.name}")
        # Add new vectors
        vectordb.add_documents(documents=chunks)
        vectordb.persist()
        logger.info(f"Successfully updated {uploaded_file.name} in vector database")
        return True
    except Exception as e:
        logger.error(f"Error processing uploaded PDF {uploaded_file.name}: {str(e)}", exc_info=True)
        return False
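# Usage sketch (hypothetical caller; uploaded_file is assumed to follow the
# Streamlit UploadedFile interface, which provides .name and .getvalue()):
#     uploaded = st.file_uploader("Upload a PDF", type="pdf")
#     if uploaded is not None:
#         update_or_add_pdf(uploaded, config['data_path'],
#                           os.environ['PERSIST_DIRECTORY'], config['collection_name'])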
def main():
    logger.info("Starting PDF processing pipeline")
    try:
        with open(CONFIG_FILE, 'r') as f:
            config = json.load(f)
        # Configuration
        data_path = config.get('data_path')
        persist_directory = os.environ.get('PERSIST_DIRECTORY')
        collection_name = config.get('collection_name')
        logger.info(f"Using configuration - data_path: {data_path}, "
                    f"persist_directory: {persist_directory}, "
                    f"collection_name: {collection_name}")
        # Save configuration
        save_config(data_path, persist_directory, collection_name)
        logger.info("Configuration saved successfully")
        # Process pipeline
        documents = load_documents(data_path)
        chunks = process_documents(documents)
        embedding_model = initialize_embedding_model()
        create_vectordb(chunks, embedding_model, persist_directory, collection_name)
        logger.info("PDF processing pipeline completed successfully!")
    except Exception:
        logger.error("Fatal error in PDF processing pipeline", exc_info=True)
        raise
if __name__ == "__main__":
    main()