mgbam committed
Commit fe062b2 · verified · 1 Parent(s): 5141f31

Create document_manager.py

Files changed (1)
  1. document_manager.py +89 -0
document_manager.py ADDED
@@ -0,0 +1,89 @@
# document_manager.py

import logging
import hashlib
import time
from typing import List, Optional, Any

import chromadb
import torch
from PIL import Image
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from config import ResearchConfig
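
# ResearchConfig comes from the repo's config module, which is not part of this
# commit. Judging from the attributes referenced below, it is assumed to expose at
# least CHROMA_PATH (location of the persistent Chroma store), EMBEDDING_DIMENSIONS
# (passed to text-embedding-3-large), and CHUNK_SIZE / CHUNK_OVERLAP (text splitter
# settings).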

logger = logging.getLogger(__name__)


class QuantumDocumentManager:
    """
    Manages creation of Chroma collections from raw document texts.
    """

    def __init__(self) -> None:
        try:
            # Prefer an on-disk store so collections survive process restarts.
            self.client = chromadb.PersistentClient(path=ResearchConfig.CHROMA_PATH)
            logger.info("Initialized PersistentClient for Chroma.")
        except Exception:
            logger.exception("Error initializing PersistentClient; falling back to in-memory client.")
            self.client = chromadb.Client()
        self.embeddings = OpenAIEmbeddings(
            model="text-embedding-3-large",
            dimensions=ResearchConfig.EMBEDDING_DIMENSIONS
        )

    def create_collection(self, documents: List[str], collection_name: str) -> Any:
        """Split raw texts into chunks and index them in a named Chroma collection."""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=ResearchConfig.CHUNK_SIZE,
            chunk_overlap=ResearchConfig.CHUNK_OVERLAP,
            separators=["\n\n", "\n", "|||"]
        )
        try:
            docs = splitter.create_documents(documents)
            logger.info(f"Created {len(docs)} document chunks for collection '{collection_name}'.")
        except Exception:
            logger.exception("Error during document splitting.")
            raise
        return Chroma.from_documents(
            documents=docs,
            embedding=self.embeddings,
            client=self.client,
            collection_name=collection_name,
            ids=[self._document_id(doc.page_content) for doc in docs]
        )

    def _document_id(self, content: str) -> str:
        """Derive an ID from the chunk's content hash plus the ingestion timestamp."""
        return f"{hashlib.sha256(content.encode()).hexdigest()[:16]}-{int(time.time())}"


class ExtendedQuantumDocumentManager(QuantumDocumentManager):
    """
    Extends QuantumDocumentManager with multi-modal (image) document handling.
    Uses dependency injection for CLIP components.
    """

    def __init__(self, clip_model: Any, clip_processor: Any) -> None:
        super().__init__()
        self.clip_model = clip_model
        self.clip_processor = clip_processor

    def create_image_collection(self, image_paths: List[str]) -> Optional[Any]:
        """Embed images with CLIP and store them in a native chromadb collection."""
        embeddings = []
        valid_images = []
        for img_path in image_paths:
            try:
                image = Image.open(img_path).convert("RGB")
                inputs = self.clip_processor(images=image, return_tensors="pt")
                with torch.no_grad():
                    emb = self.clip_model.get_image_features(**inputs)
                # chromadb expects plain float lists, one embedding per document.
                embeddings.append(emb[0].tolist())
                valid_images.append(img_path)
            except FileNotFoundError:
                logger.warning(f"Image file not found: {img_path}. Skipping.")
            except Exception:
                logger.exception(f"Error processing image {img_path}.")
        if not embeddings:
            logger.error("No valid images found for image collection.")
            return None
        collection = self.client.get_or_create_collection(name="neuro_images")
        collection.add(
            ids=[self._document_id(path) for path in valid_images],
            embeddings=embeddings,
            documents=valid_images
        )
        return collection
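
For reference, a minimal usage sketch (not part of this commit): the CLIP checkpoint name, the sample texts, and the image path below are illustrative assumptions; only the two manager classes and their methods come from the file above. Running it also requires the config module referenced above and an OpenAI API key for the embeddings.

from transformers import CLIPModel, CLIPProcessor
from document_manager import ExtendedQuantumDocumentManager

# Inject the CLIP components the class expects (checkpoint name is an example).
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

manager = ExtendedQuantumDocumentManager(clip_model, clip_processor)

# Text documents are chunked and indexed via the inherited method.
text_store = manager.create_collection(
    documents=["Quantum error correction overview...", "Neuromorphic hardware notes..."],
    collection_name="research_notes"
)

# Images are embedded with CLIP and stored in the "neuro_images" collection.
image_collection = manager.create_image_collection(["figures/chip_layout.png"])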