# src/utils/drive_document_processor.py
from pathlib import Path
from typing import Dict, List, Any
from fastapi import HTTPException

from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger

class DriveDocumentProcessor:
    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor
    ):
        """
        Initialize Drive Document Processor
        
        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
        """
        self.google_drive_service = GoogleDriveService(google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor
        
        # Create the temp directory (including any missing parents) if it doesn't exist
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        
        # Define supported MIME types
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',  # Export Google Docs as DOCX
            
            # Microsoft Word Documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',
            
            # Microsoft Excel Documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',
            
            # Text Documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',
            
            # PDF Documents
            'application/pdf': '.pdf'
        }

        # Define export MIME types for Google Docs formats
        self.google_docs_export_types = {
            'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def process_documents(
        self,
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder
        
        Args:
            vector_store (ChromaVectorStore): Vector store instance
            
        Returns:
            Dict[str, Any]: Overall status plus counts and per-file details
                for processed, skipped, and errored files
        """
        try:
            # Get documents from folder
            files = self.google_drive_service.get_folder_contents(self.folder_id)
            
            processed_files = []
            skipped_files = []
            errors = []
            
            for file in files:
                result = await self._process_single_file(file, vector_store)
                
                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])
            
            # Clean up temporary directory if empty
            self._cleanup_temp_dir()
            
            return {
                "status": "completed",
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }
            
        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file"""

        mime_type = file.get('mimeType', '')
        
        # Skip if mime type not supported
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }
        
        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')  # Get last modified time
            
            # Check if document should be processed
            if self._should_process_document(document_id, vector_store, modified_time):
                # Download and process file
                temp_file_path = await self._download_and_save_file(
                    file['id'],
                    mime_type
                )
            
                try:
                    # Process document
                    processed_doc = await self.doc_processor.process_document(
                        str(temp_file_path)
                    )
                
                    # Add to vector store
                    self._add_to_vector_store(
                        processed_doc['chunks'],
                        file,
                        mime_type,
                        vector_store
                    )
                
                    return {
                        'status': 'processed',
                        'data': {
                            'name': file['name'],
                            'id': file['id'],
                            'chunks_processed': len(processed_doc['chunks'])
                        }
                    }
                
                finally:
                    # Clean up temporary file
                    if temp_file_path.exists():
                        temp_file_path.unlink()
            else:
                # Return skipped status if document already exists and is up to date
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'reason': 'Document already up to date in the vector store.'
                    }
                }
                    
        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'error': str(e)
                }
            }

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download and save file to temporary location"""
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"
        
        if mime_type in self.google_docs_export_types:
            # Download Google Doc in the specified export format
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Download regular file
            content = self.google_drive_service.download_file(file_id)
        
        with open(temp_file_path, 'wb') as f:
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)
            
        return temp_file_path

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to vector store"""
        chunk_metadatas = []
        chunk_ids = []
        
        modified_time = file.get('modifiedTime', 'N/A')  # Last modified time from Drive

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file['name'],
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })
        
        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )
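
    # The ChromaVectorStore methods used here and in the freshness check
    # below (add_documents, get_document_chunks, delete_document) are assumed
    # to be thin wrappers over a chromadb collection. A minimal sketch of the
    # assumed shapes, not the project's actual implementation:
    #
    #     def add_documents(self, documents, metadatas, ids):
    #         self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
    #
    #     def get_document_chunks(self, document_id):
    #         result = self.collection.get(
    #             where={"document_id": document_id}, include=["metadatas"]
    #         )
    #         return [{"id": i, "metadata": m}
    #                 for i, m in zip(result["ids"], result["metadatas"])]
    #
    #     def delete_document(self, document_id):
    #         self.collection.delete(where={"document_id": document_id})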
    
    def _should_process_document(
        self,
        document_id: str,
        vector_store: ChromaVectorStore,
        modified_date: str
    ) -> bool:
        """
        Decide whether a document needs (re)processing, deleting stale chunks.

        If the stored chunks' modified_time does not match modified_date, all
        chunks for the document are deleted so it can be re-indexed.

        Args:
            document_id (str): The ID of the document.
            vector_store (ChromaVectorStore): The Chroma vector store instance.
            modified_date (str): The modification date reported by Drive.

        Returns:
            bool: True if the document should be processed (new, stale, or the
                check failed), False if the stored copy is up to date.
        """
        try:
            # Retrieve all chunks for the given document_id
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                logger.info(f"No chunks found for document_id: {document_id}; processing as a new document.")
                return True

            # All chunks of a document share the same modified_time, so
            # checking the first chunk is sufficient
            first_chunk_metadata = chunks[0].get("metadata", {})

            if first_chunk_metadata.get("modified_time") != modified_date:
                # Stale copy: delete all chunks so the document is re-indexed
                vector_store.delete_document(document_id)
                logger.info(f"Deleted all chunks for document_id: {document_id} due to modified_time mismatch.")
                return True

            logger.info(f"No deletion needed for document_id: {document_id}; modified_time is unchanged.")
            return False

        except Exception as e:
            # Fail open: reprocess the document if the freshness check fails
            logger.error(f"Error while checking chunks for document_id {document_id}: {str(e)}")
            return True

    def _cleanup_temp_dir(self) -> None:
        """Clean up temporary directory if empty"""
        if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
            self.temp_dir.rmdir()
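

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): wiring the processor into an ingestion
# run. The DocumentProcessor and ChromaVectorStore constructor calls and the
# paths/IDs below are assumptions, not values confirmed by this project.
# ---------------------------------------------------------------------------
#
# import asyncio
#
# async def main() -> None:
#     doc_processor = DocumentProcessor()   # assumed default constructor
#     vector_store = ChromaVectorStore()    # assumed default constructor
#     processor = DriveDocumentProcessor(
#         google_service_account_path="service_account.json",  # hypothetical path
#         folder_id="<your-drive-folder-id>",
#         temp_dir="./tmp_drive_files",
#         doc_processor=doc_processor,
#     )
#     results = await processor.process_documents(vector_store)
#     print(f"Processed {results['processed_files']['count']} file(s), "
#           f"skipped {results['skipped_files']['count']}, "
#           f"errors {results['errors']['count']}")
#
# if __name__ == "__main__":
#     asyncio.run(main())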