import os
import uuid
import json
import logging
from config import save_config
from dotenv import load_dotenv
from log_utils import setup_logging
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

CONFIG_FILE = 'config.json'
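
# Expected config.json shape, inferred from the config.get() calls in main();
# the values below are illustrative placeholders, not the project's settings:
#
#     {
#         "data_path": "data",
#         "collection_name": "pdf_collection"
#     }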

# Load environment variables
load_dotenv()
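# A .env file (picked up by load_dotenv) is assumed to provide
# PERSIST_DIRECTORY, the directory Chroma persists its index to, e.g.:
#
#     PERSIST_DIRECTORY=chroma_db
#
# The value is illustrative only.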

logger = setup_logging('upload_pdf')

def load_documents(data_path):
    """Load PDF documents from the specified directory."""
    logger.info(f"Starting document loading from directory: {data_path}")
    
    if not os.path.exists(data_path):
        logger.error(f"Directory not found: {data_path}")
        raise FileNotFoundError(f"Directory not found: {data_path}")
    
    directory_loader = DirectoryLoader(
        data_path,
        loader_cls=PyMuPDFLoader,
        show_progress=True
    )
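    # No glob filter is set, so DirectoryLoader hands every file in data_path
    # to PyMuPDFLoader; passing glob="*.pdf" (a supported DirectoryLoader
    # keyword) would be one way to skip non-PDF files.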
    
    try:
        documents = directory_loader.load()
        logger.info(f"Successfully loaded {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error loading documents: {str(e)}", exc_info=True)
        raise

def store_full_content(documents):
    """Store full page content in document metadata."""
    logger.info("Starting to store full page content in metadata")
    try:
        for doc in documents:
            doc.metadata['full_page_content'] = doc.page_content
            logger.debug(f"Stored full content for page {doc.metadata.get('page', 'Unknown')} "
                        f"from {os.path.basename(doc.metadata.get('file_path', 'Unknown'))}")
        logger.info(f"Successfully stored full content for {len(documents)} documents")
        return documents
    except Exception as e:
        logger.error(f"Error storing full content: {str(e)}", exc_info=True)
        raise

def process_documents(documents):
    """Process documents into chunks and add metadata."""
    logger.info("Starting document processing")
    
    try:
        # First store full page content
        documents = store_full_content(documents)
        
        logger.info("Converting documents to chunks")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=384, chunk_overlap=20)
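        # chunk_size and chunk_overlap are measured in characters
        # (RecursiveCharacterTextSplitter uses len() by default), not tokens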
        chunks = text_splitter.split_documents(documents)
        
        # Tag each chunk with a UUID; split_documents copies metadata, so the
        # full page content stored above normally carries over to every chunk
        for chunk in chunks:
            chunk.metadata['chunk_id'] = str(uuid.uuid4())
            chunk.metadata.setdefault('full_page_content', chunk.page_content)
        
        logger.info(f"Document processing completed. Total chunks created: {len(chunks)}")
        return chunks
    except Exception as e:
        logger.error(f"Error processing documents: {str(e)}", exc_info=True)
        raise

def initialize_embedding_model():
    """Initialize and return the embedding model."""
    logger.info("Initializing embedding model")
    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name='all-MiniLM-L6-v2',
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
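        # all-MiniLM-L6-v2 is a sentence-transformers model that outputs
        # 384-dimensional vectors; normalize_embeddings=True makes cosine
        # similarity equivalent to a dot product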
        logger.info("Embedding model initialized successfully")
        return embedding_model
    except Exception as e:
        logger.error(f"Error initializing embedding model: {str(e)}", exc_info=True)
        raise
    
def create_vectordb(chunks, embedding_model, persist_directory, collection_name):
    """Create and persist ChromaDB instance."""
    logger.info(f"Creating Chroma instance with collection name: {collection_name}")
    try:
        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=persist_directory,
            collection_name=collection_name
        )
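        # With chromadb >= 0.4, persistence happens automatically and persist()
        # should be a no-op; the explicit call is kept for older setups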
        vectordb.persist()
        logger.info("Vector database created and persisted successfully")
        return vectordb
    except Exception as e:
        logger.error(f"Error creating vector database: {str(e)}", exc_info=True)
        raise

def update_or_add_pdf(uploaded_file, data_path, persist_directory, collection_name):
    """Add or replace a PDF in the system."""
    logger.info(f"Processing uploaded file: {uploaded_file.name}")
    
    if not uploaded_file.name.lower().endswith('.pdf'):
        logger.warning(f"Rejected non-PDF file: {uploaded_file.name}")
        return False
    
    file_path = os.path.join(data_path, uploaded_file.name)
    
    try:
        # Remove existing PDF if it exists
        if os.path.exists(file_path):
            os.remove(file_path)
            logger.info(f"Deleted existing PDF: {uploaded_file.name}")

        # Save the uploaded PDF
        with open(file_path, 'wb') as f:
            f.write(uploaded_file.getvalue())
        logger.info(f"Saved new PDF: {uploaded_file.name}")

        # Reload the directory and keep only the pages from the uploaded file
        documents = load_documents(data_path)
        new_documents = [
            doc for doc in documents
            if os.path.basename(doc.metadata.get('file_path', '')) == uploaded_file.name
        ]
        
        if not new_documents:
            logger.error(f"No documents found for uploaded file: {uploaded_file.name}")
            return False

        chunks = process_documents(new_documents)
        embedding_model = initialize_embedding_model()
        
        # Update vector database
        vectordb = Chroma(
            persist_directory=persist_directory,
            embedding_function=embedding_model,
            collection_name=collection_name
        )
        
        # Remove existing vectors for this file; PyMuPDFLoader records the
        # file path under the 'source' metadata key, which this filter matches
        existing_docs = vectordb.get(where={"source": file_path})
        if existing_docs['ids']:
            vectordb.delete(ids=existing_docs['ids'])
            logger.info(f"Removed existing vectors for {uploaded_file.name}")
        
        # Add new vectors
        vectordb.add_documents(documents=chunks)
        vectordb.persist()
        logger.info(f"Successfully updated {uploaded_file.name} in vector database")
        
        return True
    except Exception as e:
        logger.error(f"Error processing uploaded PDF {uploaded_file.name}: {str(e)}", exc_info=True)
        return False
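
# Hypothetical caller sketch (not part of this module): update_or_add_pdf is
# shaped for a Streamlit file-uploader widget, e.g.:
#
#     import streamlit as st
#     uploaded = st.file_uploader("Upload a PDF", type="pdf")
#     if uploaded is not None:
#         if update_or_add_pdf(uploaded, data_path, persist_directory, collection_name):
#             st.success(f"Indexed {uploaded.name}")
#         else:
#             st.error(f"Failed to index {uploaded.name}")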

def main():
    logger.info("Starting PDF processing pipeline")
    try:
        with open(CONFIG_FILE, 'r') as f:
            config = json.load(f)
            
        # Configuration
        data_path = config.get('data_path')
        persist_directory = os.environ.get('PERSIST_DIRECTORY')
        collection_name = config.get('collection_name')

        if not persist_directory:
            # Without a persist directory Chroma would build an in-memory
            # index that vanishes on exit, so fail fast instead
            raise ValueError("PERSIST_DIRECTORY environment variable is not set")
        
        logger.info(f"Using configuration - data_path: {data_path}, "
                   f"persist_directory: {persist_directory}, "
                   f"collection_name: {collection_name}")
        
        # Save configuration
        save_config(data_path, persist_directory, collection_name)
        logger.info("Configuration saved successfully")
        
        # Process pipeline
        documents = load_documents(data_path)
        chunks = process_documents(documents)
        embedding_model = initialize_embedding_model()
        create_vectordb(chunks, embedding_model, persist_directory, collection_name)
        
        logger.info("PDF processing pipeline completed successfully!")
    
    except Exception:
        logger.error("Fatal error in PDF processing pipeline", exc_info=True)
        raise

if __name__ == "__main__":
    main()