Spaces:

mgbam
/

NeuroResearch_AI

Sleeping

App Files Files Community

mgbam committed on Mar 25

Commit

fe062b2

verified ·

1 Parent(s): 5141f31

Create document_manager.py

Browse files

Files changed (1) hide show

document_manager.py +89 -0

document_manager.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# document_manager.py
+import hashlib
+import logging
+import time
+from typing import List, Optional, Any
+
+import chromadb
+import torch
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain_openai import OpenAIEmbeddings
+from PIL import Image
+
+from config import ResearchConfig
+logger = logging.getLogger(__name__)
+class QuantumDocumentManager:
+    """
+    Manages creation of Chroma collections from raw document texts.
+    """
+    def __init__(self) -> None:
+        try:
+            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
+            logger.info("Initialized PersistentClient for Chroma.")
+        except Exception as e:
+            logger.exception("Error initializing PersistentClient; falling back to in-memory client.")
+            self.client = chromadb.Client()
+        self.embeddings = OpenAIEmbeddings(
+            model="text-embedding-3-large",
+            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
+        )
+    def create_collection(self, documents: List[str], collection_name: str) -> Any:
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=ResearchConfig.CHUNK_SIZE,
+            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
+            separators=["\n\n", "\n", "|||"]
+        )
+        try:
+            docs = splitter.create_documents(documents)
+            logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")
+        except Exception as e:
+            logger.exception("Error during document splitting.")
+            raise e
+        return chromadb.Chroma.from_documents(
+            documents=docs,
+            embedding=self.embeddings,
+            client=self.client,
+            collection_name=collection_name,
+            ids=[self._document_id(doc.page_content) for doc in docs]
+        )
+    def _document_id(self, content: str) -> str:
+        return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"
+class ExtendedQuantumDocumentManager(QuantumDocumentManager):
+    """
+    Extends QuantumDocumentManager with multi-modal (image) document handling.
+    Uses dependency injection for CLIP components.
+    """
+    def __init__(self, clip_model: Any, clip_processor: Any) -> None:
+        super().__init__()
+        self.clip_model = clip_model
+        self.clip_processor = clip_processor
+    def create_image_collection(self, image_paths: List[str]) -> Optional[Any]:
+        embeddings = []
+        valid_images = []
+        for img_path in image_paths:
+            try:
+                image = Image.open(img_path)
+                inputs = self.clip_processor(images=image, return_tensors="pt")
+                with torch.no_grad():
+                    emb = self.clip_model.get_image_features(**inputs)
+                embeddings.append(emb.numpy())
+                valid_images.append(img_path)
+            except FileNotFoundError:
+                logger.warning(f"Image file not found: {img_path}. Skipping.")
+            except Exception as e:
+                logger.exception(f"Error processing image {img_path}: {str(e)}")
+        if not embeddings:
+            logger.error("No valid images found for image collection.")
+            return None
+        return chromadb.Chroma.from_embeddings(
+            embeddings=embeddings,
+            documents=valid_images,
+            collection_name="neuro_images"
+        )