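"""Ingest PDF documents into a persistent Chroma vector store.

Pipeline: load PDFs with PyMuPDF, stash each page's full text in metadata,
split pages into overlapping chunks, embed them with a sentence-transformers
model, and persist the vectors to ChromaDB. update_or_add_pdf() additionally
supports replacing a single PDF without rebuilding the whole collection.
"""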
import json
import os
import uuid

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from config import save_config
from log_utils import setup_logging
CONFIG_FILE = 'config.json'

# Load environment variables (e.g. PERSIST_DIRECTORY) from a local .env file
load_dotenv()
logger = setup_logging('upload_pdf')

def load_documents(data_path):
    """Load PDF documents from the specified directory."""
    logger.info(f"Starting document loading from directory: {data_path}")
    if not os.path.exists(data_path):
        logger.error(f"Directory not found: {data_path}")
        raise FileNotFoundError(f"Directory not found: {data_path}")
    directory_loader = DirectoryLoader(
        data_path,
        glob="**/*.pdf",  # restrict to PDFs; other file types would fail in PyMuPDFLoader
        loader_cls=PyMuPDFLoader,
        show_progress=True
    )
    try:
        documents = directory_loader.load()
        logger.info(f"Successfully loaded {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error loading documents: {str(e)}", exc_info=True)
        raise
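
# PyMuPDFLoader emits one Document per page, with metadata that includes
# 'source', 'file_path', and 'page' — the keys the functions below rely on.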

def store_full_content(documents):
    """Store full page content in document metadata."""
    logger.info("Starting to store full page content in metadata")
    try:
        for doc in documents:
            doc.metadata['full_page_content'] = doc.page_content
            logger.debug(f"Stored full content for page {doc.metadata.get('page', 'Unknown')} "
                         f"from {os.path.basename(doc.metadata.get('file_path', 'Unknown'))}")
        logger.info(f"Successfully stored full content for {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error storing full content: {str(e)}", exc_info=True)
        raise
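
# Chunk size note: 384 characters with a 20-character overlap is a tuning
# choice sized for the all-MiniLM-L6-v2 encoder used below; treat the exact
# numbers as adjustable rather than required.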
def process_documents(documents):
    """Process documents into chunks and add metadata."""
    logger.info("Starting document processing")
    try:
        # First store full page content
        documents = store_full_content(documents)
        logger.info("Converting documents to chunks")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=384, chunk_overlap=20)
        chunks = text_splitter.split_documents(documents)
        # Tag each chunk with a UUID; split_documents copies metadata, so the
        # full page text normally carries over — fall back to the chunk text if not
        for chunk in chunks:
            chunk.metadata['chunk_id'] = str(uuid.uuid4())
            if 'full_page_content' not in chunk.metadata:
                chunk.metadata['full_page_content'] = chunk.page_content
        logger.info(f"Document processing completed. Total chunks created: {len(chunks)}")
        return chunks
    except Exception as e:
        logger.error(f"Error processing documents: {str(e)}", exc_info=True)
        raise

def initialize_embedding_model():
    """Initialize and return the embedding model."""
    logger.info("Initializing embedding model")
    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name='all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        logger.info("Embedding model initialized successfully")
        return embedding_model
    except Exception as e:
        logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
        raise
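
# Quick sanity check (illustrative; assumes sentence-transformers is installed):
#     model = initialize_embedding_model()
#     vec = model.embed_query("hello world")
#     len(vec)  # -> 384; unit-normalized because normalize_embeddings=True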

def create_vectordb(chunks, embedding_model, persist_directory, collection_name):
    """Create and persist ChromaDB instance."""
    logger.info(f"Creating Chroma instance with collection name: {collection_name}")
    try:
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name
        )
        vectordb.persist()
        logger.info("Vector database created and persisted successfully")
        return vectordb
    except Exception as e:
        logger.error(f"Error creating vector database: {str(e)}", exc_info=True)
        raise
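
# Example retrieval against the persisted store (query text is hypothetical):
#     vectordb = create_vectordb(chunks, embedding_model, persist_directory, collection_name)
#     hits = vectordb.similarity_search("What does the introduction cover?", k=4)
#     hits[0].metadata['full_page_content']  # full page the top chunk came from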

def update_or_add_pdf(uploaded_file, data_path, persist_directory, collection_name):
    """Add or replace a PDF in the system."""
    logger.info(f"Processing uploaded file: {uploaded_file.name}")
    if not uploaded_file.name.lower().endswith('.pdf'):
        logger.warning(f"Rejected non-PDF file: {uploaded_file.name}")
        return False
    file_path = os.path.join(data_path, uploaded_file.name)
    try:
        # Remove existing PDF if it exists
        if os.path.exists(file_path):
            os.remove(file_path)
            logger.info(f"Deleted existing PDF: {uploaded_file.name}")
        # Save the uploaded PDF
        with open(file_path, 'wb') as f:
            f.write(uploaded_file.getvalue())
        logger.info(f"Saved new PDF: {uploaded_file.name}")
        # Reload the directory and keep only the pages belonging to this file
        documents = load_documents(data_path)
        new_documents = [
            doc for doc in documents
            if os.path.basename(doc.metadata.get('file_path', '')) == uploaded_file.name
        ]
        if not new_documents:
            logger.error(f"No documents found for uploaded file: {uploaded_file.name}")
            return False
        chunks = process_documents(new_documents)
        embedding_model = initialize_embedding_model()
        # Open the existing collection
        vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
            collection_name=collection_name
        )
        # Remove any vectors previously indexed for this file
        existing_docs = vectordb.get(where={"source": file_path})
        if existing_docs['ids']:
            vectordb.delete(ids=existing_docs['ids'])
            logger.info(f"Removed existing vectors for {uploaded_file.name}")
        # Add new vectors
        vectordb.add_documents(documents=chunks)
        vectordb.persist()
        logger.info(f"Successfully updated {uploaded_file.name} in vector database")
        return True
    except Exception as e:
        logger.error(f"Error processing uploaded PDF {uploaded_file.name}: {str(e)}", exc_info=True)
        return False
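
# `uploaded_file` is expected to expose .name and .getvalue(), matching
# Streamlit's UploadedFile. A sketch of a caller, assuming Streamlit:
#     uploaded = st.file_uploader("Upload a PDF", type="pdf")
#     if uploaded is not None:
#         update_or_add_pdf(uploaded, data_path, persist_directory, collection_name)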

def main():
    logger.info("Starting PDF processing pipeline")
    try:
        with open(CONFIG_FILE, 'r') as f:
            config = json.load(f)
        # Configuration
        data_path = config.get('data_path')
        persist_directory = os.environ.get('PERSIST_DIRECTORY')
        collection_name = config.get('collection_name')
        logger.info(f"Using configuration - data_path: {data_path}, "
                    f"persist_directory: {persist_directory}, "
                    f"collection_name: {collection_name}")
        # Save configuration
        save_config(data_path, persist_directory, collection_name)
        logger.info("Configuration saved successfully")
        # Process pipeline
        documents = load_documents(data_path)
        chunks = process_documents(documents)
        embedding_model = initialize_embedding_model()
        create_vectordb(chunks, embedding_model, persist_directory, collection_name)
        logger.info("PDF processing pipeline completed successfully!")
    except Exception:
        logger.error("Fatal error in PDF processing pipeline", exc_info=True)
        raise
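
# Illustrative config.json (keys match the reads above; values are examples):
# {
#     "data_path": "data",
#     "collection_name": "pdf_documents"
# }
# PERSIST_DIRECTORY comes from the environment (loaded via .env).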

if __name__ == "__main__":
    main()