VanKee committed on
Commit
8d27db4
Β·
1 Parent(s): 42d7509

feat: implement hospital-specific customization pipeline with two-stage ANNOY retrieval

Browse files

- Restructure pdf-version to customization/ for hospital-specific deployment
- Add customization_pipeline.py with two-stage retrieval (tag -> document -> chunk)
- Implement ANNOY indices for fast medical concept and chunk similarity search
- Add generate_embeddings.py for building hospital-specific embeddings
- Create test suite validating end-to-end pipeline functionality
- Add customization_requirements.txt with all necessary dependencies
- Update .gitignore to exclude rag_env/ virtual environment

This enables hospitals to deploy their own customized medical RAG systems
with private documents while maintaining the base medical knowledge framework.

Files changed (28) hide show
  1. .gitignore +1 -0
  2. customization/customization_pipeline.py +156 -0
  3. {src/pdf-version β†’ customization}/generate_embeddings.py +14 -18
  4. {src/pdf-version β†’ customization/src}/__init__.py +0 -0
  5. customization/src/data/__init__.py +23 -0
  6. {src/pdf-version β†’ customization/src}/data/loaders.py +0 -0
  7. {src/pdf-version β†’ customization/src}/data/pdf_processing.py +0 -0
  8. {src/pdf-version β†’ customization/src}/demos/__init__.py +0 -0
  9. {src/pdf-version β†’ customization/src}/demos/demo_runner.py +157 -4
  10. {src/pdf-version β†’ customization/src}/indexing/__init__.py +0 -0
  11. customization/src/indexing/annoy_manager.py +392 -0
  12. {src/pdf-version β†’ customization/src}/indexing/document_indexer.py +0 -0
  13. {src/pdf-version β†’ customization/src}/indexing/embedding_creator.py +0 -0
  14. {src/pdf-version β†’ customization/src}/indexing/storage.py +91 -2
  15. {src/pdf-version β†’ customization/src}/models/__init__.py +0 -0
  16. {src/pdf-version β†’ customization/src}/models/embedding_models.py +0 -0
  17. {src/pdf-version β†’ customization/src}/rag/__init__.py +0 -0
  18. {src/pdf-version β†’ customization/src}/rag/medical_rag_pipeline.py +0 -0
  19. {src/pdf-version β†’ customization/src}/retrieval/__init__.py +0 -0
  20. {src/pdf-version β†’ customization/src}/retrieval/chunk_retriever.py +177 -3
  21. {src/pdf-version β†’ customization/src}/retrieval/document_retriever.py +207 -3
  22. {src/pdf-version β†’ customization/src}/utils/__init__.py +0 -0
  23. {src/pdf-version β†’ customization/src}/utils/helpers.py +0 -0
  24. customization/test/test_pipeline.py +117 -0
  25. customization_requirements.txt +188 -0
  26. src/pdf-version/data/__init__.py +0 -15
  27. src/pdf-version/main.py +0 -83
  28. src/pdf-version/oncall_ai.py +0 -55
.gitignore CHANGED
@@ -1,6 +1,7 @@
1
  # 🧠 Virtual environments
2
  genAIvenv/
3
  .final_project_env/
 
4
  .env
5
  .venv
6
  env/
 
1
  # 🧠 Virtual environments
2
  genAIvenv/
3
  .final_project_env/
4
+ rag_env/
5
  .env
6
  .venv
7
  env/
customization/customization_pipeline.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Customization Pipeline - Hospital-Specific Document Retrieval
3
+
4
+ This module provides the interface for hospital-specific document processing and retrieval.
5
+ """
6
+
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import List, Dict
10
+
11
+ # Add src directory to Python path
12
+ sys.path.insert(0, str(Path(__file__).parent / 'src'))
13
+
14
+ # Import necessary modules
15
+ from models.embedding_models import load_biomedbert_model
16
+ from data.loaders import load_annotations
17
+ from indexing.document_indexer import build_document_index
18
+ from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
19
+ from indexing.storage import save_document_system, load_document_system_with_annoy
20
+ from retrieval.document_retriever import create_document_tag_mapping
21
+ from retrieval.chunk_retriever import find_relevant_chunks_with_fallback
22
+
23
+
24
+ def build_customization_embeddings():
25
+ """Build embeddings for the hospital-specific documents in the docs folder."""
26
+ print("πŸ₯ Building hospital-specific embeddings...")
27
+
28
+ # Paths
29
+ base_path = Path(__file__).parent
30
+ docs_path = base_path / "docs"
31
+ processing_path = base_path / "processing"
32
+
33
+ # Load model and annotations
34
+ embedding_model = load_biomedbert_model()
35
+ annotations = load_annotations(file_path=str(processing_path / "mapping.json"))
36
+
37
+ if not annotations:
38
+ print("❌ Unable to load annotation data")
39
+ return False
40
+
41
+ # Build document index with chunks
42
+ print("πŸ“„ Processing documents...")
43
+ document_index = build_document_index(
44
+ annotations,
45
+ assets_dir=str(docs_path),
46
+ chunk_size=256,
47
+ chunk_overlap=25
48
+ )
49
+
50
+ # Create embeddings
51
+ print("πŸ”’ Creating embeddings...")
52
+ tag_embeddings = create_tag_embeddings(embedding_model, document_index)
53
+ doc_tag_mapping = create_document_tag_mapping(document_index, tag_embeddings)
54
+ chunk_embeddings = create_chunk_embeddings(embedding_model, document_index)
55
+
56
+ # Save everything
57
+ print("πŸ’Ύ Saving to processing folder...")
58
+ save_document_system(
59
+ document_index,
60
+ tag_embeddings,
61
+ doc_tag_mapping,
62
+ chunk_embeddings,
63
+ output_dir=str(processing_path / "embeddings"),
64
+ build_annoy_indices=True
65
+ )
66
+
67
+ print("βœ… Embeddings built successfully!")
68
+ return True
69
+
70
+
71
def retrieve_document_chunks(query: str, top_k: int = 5) -> List[Dict]:
    """Two-stage ANNOY retrieval over the hospital's private documents.

    Stage 1 matches the query against tag (medical-concept) embeddings to
    select candidate documents; stage 2 searches chunk embeddings restricted
    to those documents.

    Args:
        query: Free-text search query.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with 'document', 'chunk_text', 'score' (cosine
        similarity) and positional 'metadata'; empty list on failure.
    """
    model = load_biomedbert_model()
    processing_dir = Path(__file__).parent / "processing"

    # Restore the persisted system together with its ANNOY indices.
    loaded = load_document_system_with_annoy(
        input_dir=str(processing_dir / "embeddings"),
        annoy_dir=str(processing_dir / "indices"),
    )
    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = loaded

    if annoy_manager is None:
        print("❌ Failed to load ANNOY manager")
        return []

    query_vector = model.encode(query)

    # ---- Stage 1: tag search -> candidate documents -----------------------
    print(f"πŸ” Stage 1: Finding relevant documents for query: '{query}'")
    nearest_tags, _tag_distances = annoy_manager.search_tags(
        query_embedding=query_vector,
        n_neighbors=20,  # over-fetch tags so several documents can surface
        include_distances=True,
    )

    # Any document carrying one of the 10 best tags becomes a candidate.
    candidate_docs = list({
        name
        for tag in nearest_tags[:10]
        for name, info in doc_tag_mapping.items()
        if tag in info['tags']
    })
    print(f"βœ… Found {len(candidate_docs)} relevant documents based on medical tags")

    if not candidate_docs:
        print("❌ No relevant documents found")
        return []

    # ---- Stage 2: chunk search restricted to the candidates ---------------
    print(f"πŸ” Stage 2: Finding relevant chunks within {len(candidate_docs)} documents")
    hits, hit_distances = annoy_manager.search_chunks_in_documents(
        query_embedding=query_vector,
        document_names=candidate_docs,
        n_neighbors=top_k,
        include_distances=True,
    )

    # Convert ANNOY angular distances to cosine similarities for scoring.
    from indexing.annoy_manager import convert_angular_distance_to_cosine_similarity

    results = [
        {
            'document': hit['document'],
            'chunk_text': hit['text'],
            'score': convert_angular_distance_to_cosine_similarity(dist),
            'metadata': {
                'chunk_id': hit['chunk_id'],
                'start_char': hit.get('start_char', 0),
                'end_char': hit.get('end_char', 0),
            },
        }
        for hit, dist in zip(hits, hit_distances)
    ]

    print(f"βœ… Retrieved {len(results)} relevant chunks")
    return results
{src/pdf-version β†’ customization}/generate_embeddings.py RENAMED
@@ -1,18 +1,12 @@
1
  #!/usr/bin/env python3
2
  """
3
- Quick script to generate new embeddings with sentence-based chunking
4
  """
5
 
6
- import sys
7
- from pathlib import Path
8
-
9
- # Add pdf-version directory to Python path
10
- sys.path.insert(0, str(Path(__file__).parent))
11
-
12
- from demos.demo_runner import build_medical_rag_system
13
 
14
  def main():
15
- print("πŸš€ Starting to build medical RAG system with new sentence-based chunking...")
16
  print("πŸ“‹ Configuration:")
17
  print(" - Chunk size: 256 tokens")
18
  print(" - Chunk overlap: 25 tokens (10%)")
@@ -22,17 +16,19 @@ def main():
22
  print("")
23
 
24
  try:
25
- result = build_medical_rag_system(enable_chunk_embeddings=True)
26
 
27
- if result[0] is not None:
28
- print("βœ… Successfully built medical RAG system!")
29
- print("πŸ“ Generated files:")
30
- print(" - document_index.json")
31
- print(" - tag_embeddings.json")
32
- print(" - document_tag_mapping.json")
33
- print(" - chunk_embeddings.json")
 
 
34
  else:
35
- print("❌ Failed to build system")
36
 
37
  except KeyboardInterrupt:
38
  print("\n⚠️ Process interrupted by user")
 
1
  #!/usr/bin/env python3
2
  """
3
+ Generate embeddings for hospital-specific documents
4
  """
5
 
6
+ from customization_pipeline import build_customization_embeddings
 
 
 
 
 
 
7
 
8
  def main():
9
+ print("πŸš€ Starting to build hospital-specific embeddings...")
10
  print("πŸ“‹ Configuration:")
11
  print(" - Chunk size: 256 tokens")
12
  print(" - Chunk overlap: 25 tokens (10%)")
 
16
  print("")
17
 
18
  try:
19
+ success = build_customization_embeddings()
20
 
21
+ if success:
22
+ print("\nβœ… Successfully built embeddings!")
23
+ print("πŸ“ Generated files in processing folder:")
24
+ print(" - embeddings/document_index.json")
25
+ print(" - embeddings/tag_embeddings.json")
26
+ print(" - embeddings/document_tag_mapping.json")
27
+ print(" - embeddings/chunk_embeddings.json")
28
+ print(" - indices/annoy_metadata.json")
29
+ print(" - indices/*.ann files")
30
  else:
31
+ print("\n❌ Failed to build embeddings")
32
 
33
  except KeyboardInterrupt:
34
  print("\n⚠️ Process interrupted by user")
{src/pdf-version β†’ customization/src}/__init__.py RENAMED
File without changes
customization/src/data/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data loading and PDF processing."""
2
+
3
+ from .loaders import load_annotations, filter_pdf_files
4
+
5
+ # Try to import PDF processing functions, but handle missing dependencies gracefully
6
+ try:
7
+ from .pdf_processing import (
8
+ extract_pdf_text,
9
+ extract_tables_from_pdf,
10
+ extract_images_ocr_from_pdf,
11
+ extract_pdf_content_enhanced
12
+ )
13
+ PDF_PROCESSING_AVAILABLE = True
14
+ __all__ = [
15
+ 'load_annotations', 'filter_pdf_files',
16
+ 'extract_pdf_text', 'extract_tables_from_pdf',
17
+ 'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
18
+ ]
19
+ except ImportError as e:
20
+ print(f"⚠️ PDF processing not available: {e}")
21
+ print("πŸ“ Only working with existing embeddings")
22
+ PDF_PROCESSING_AVAILABLE = False
23
+ __all__ = ['load_annotations', 'filter_pdf_files']
{src/pdf-version β†’ customization/src}/data/loaders.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/data/pdf_processing.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/demos/__init__.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/demos/demo_runner.py RENAMED
@@ -6,9 +6,15 @@ from models.embedding_models import load_biomedbert_model
6
  from data.loaders import load_annotations
7
  from indexing.document_indexer import build_document_index
8
  from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
9
- from indexing.storage import save_document_system, load_document_system
10
- from retrieval.document_retriever import create_document_tag_mapping, find_relevant_documents
11
- from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
 
 
 
 
 
 
12
 
13
 
14
  def build_medical_rag_system(enable_chunk_embeddings: bool = True):
@@ -135,4 +141,151 @@ def demo_all_strategies(query: str = "chest pain and shortness of breath"):
135
  for strategy, docs in results.items():
136
  print(f"{strategy:>10}: {len(docs)} documents selected")
137
 
138
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from data.loaders import load_annotations
7
  from indexing.document_indexer import build_document_index
8
  from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
9
+ from indexing.storage import save_document_system, load_document_system, load_document_system_with_annoy
10
+ from retrieval.document_retriever import (
11
+ create_document_tag_mapping, find_relevant_documents,
12
+ find_relevant_documents_with_fallback
13
+ )
14
+ from retrieval.chunk_retriever import (
15
+ find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag,
16
+ find_relevant_chunks_with_fallback
17
+ )
18
 
19
 
20
  def build_medical_rag_system(enable_chunk_embeddings: bool = True):
 
141
  for strategy, docs in results.items():
142
  print(f"{strategy:>10}: {len(docs)} documents selected")
143
 
144
+ return results
145
+
146
+
147
def demo_rag_query_with_annoy(query: str = "chest pain and shortness of breath",
                              strategy: str = "top_p", use_chunks: bool = True, **kwargs):
    """Demo RAG query functionality with ANNOY acceleration.

    Loads (or builds) the document system, retrieves relevant documents —
    and optionally chunks — via the ANNOY-backed fallback helpers, and
    returns the content ready for a RAG prompt.
    """
    print(f"\nπŸš€ Demo ANNOY Query: '{query}' (Strategy: {strategy}, Use chunks: {use_chunks})")
    print("=" * 80)

    # Prefer a previously saved system; otherwise build one from scratch.
    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = load_document_system_with_annoy()

    if document_index is not None:
        embedding_model = load_biomedbert_model()
    else:
        print("πŸ“¦ No saved system found, building new one...")
        built = build_medical_rag_system(enable_chunk_embeddings=use_chunks)
        if built[0] is None:
            return
        embedding_model, document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings = built

        # The build step persists indices; try to pick them up now.
        from indexing.storage import load_annoy_manager
        annoy_manager = load_annoy_manager()

    print(f"πŸ”§ ANNOY Status: {'Available' if annoy_manager else 'Not available (using fallback)'}")

    # Document-level retrieval, timed.
    print(f"\nπŸ” Finding relevant documents...")
    import time
    t0 = time.time()
    relevant_docs = find_relevant_documents_with_fallback(
        query, embedding_model, tag_embeddings, doc_tag_mapping,
        annoy_manager=annoy_manager, strategy=strategy, **kwargs
    )
    doc_search_time = time.time() - t0
    print(f"⏱️ Document search completed in {doc_search_time:.4f}s")

    if not (use_chunks and chunk_embeddings):
        # Chunking disabled or unavailable: hand back whole documents.
        rag_content = get_documents_for_rag(relevant_docs, document_index)
        print(f"\nπŸ“‹ Ready for RAG with {len(rag_content)} full documents")
        print(f"🏁 Total search time: {doc_search_time:.4f}s")
        return rag_content

    # Chunk-level retrieval inside the selected documents, timed.
    print(f"\nπŸ” Finding relevant chunks within selected documents...")
    t0 = time.time()
    relevant_chunks = find_relevant_chunks_with_fallback(
        query, embedding_model, relevant_docs, chunk_embeddings,
        annoy_manager=annoy_manager, strategy=strategy,
        top_chunks_per_doc=3, **kwargs
    )
    chunk_search_time = time.time() - t0
    print(f"⏱️ Chunk search completed in {chunk_search_time:.4f}s")

    rag_content = get_chunks_for_rag(relevant_chunks, max_chunks=10)
    print(f"\nπŸ“‹ Ready for RAG with {len(rag_content)} chunks")
    print(f"🏁 Total search time: {doc_search_time + chunk_search_time:.4f}s")
    return rag_content
212
+
213
+
214
def demo_performance_comparison(query: str = "chest pain and shortness of breath"):
    """Compare retrieval speed of the brute-force and ANNOY-backed paths."""
    print(f"\n⚑ Performance Comparison Demo")
    print("=" * 80)
    print(f"Query: '{query}'")

    # Requires a previously saved system; nothing is built here.
    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = load_document_system_with_annoy()
    if document_index is None:
        print("❌ No saved system found")
        return

    embedding_model = load_biomedbert_model()
    strategy = "top_p"
    params = {"top_p": 0.8, "min_similarity": 0.3}

    print(f"\nπŸ“Š Testing document retrieval performance...")

    import time

    # Brute-force baseline.
    t0 = time.time()
    baseline_docs = find_relevant_documents(
        query, embedding_model, tag_embeddings, doc_tag_mapping,
        strategy=strategy, **params
    )
    baseline_secs = time.time() - t0

    # ANNOY-accelerated path (falls back automatically when unavailable).
    t0 = time.time()
    fast_docs = find_relevant_documents_with_fallback(
        query, embedding_model, tag_embeddings, doc_tag_mapping,
        annoy_manager=annoy_manager, strategy=strategy, **params
    )
    fast_secs = time.time() - t0

    print(f"πŸ” Original method: {len(baseline_docs)} docs in {baseline_secs:.4f}s")
    print(f"πŸš€ ANNOY method: {len(fast_docs)} docs in {fast_secs:.4f}s")

    if fast_secs > 0:
        print(f"⚑ Speedup: {baseline_secs / fast_secs:.2f}x")

    # Agreement between the two document result sets.
    if baseline_docs and fast_docs:
        shared = set(baseline_docs) & set(fast_docs)
        print(f"πŸ“Š Result overlap: {len(shared)}/{len(baseline_docs)} documents")

    # Repeat the comparison at chunk level when chunk embeddings exist.
    if chunk_embeddings and len(baseline_docs) > 0:
        print(f"\nπŸ“Š Testing chunk retrieval performance...")
        sample_docs = baseline_docs[:2]  # keep the timing run small

        t0 = time.time()
        baseline_chunks = find_relevant_chunks(
            query, embedding_model, sample_docs, chunk_embeddings,
            strategy=strategy, **params
        )
        baseline_chunk_secs = time.time() - t0

        t0 = time.time()
        fast_chunks = find_relevant_chunks_with_fallback(
            query, embedding_model, sample_docs, chunk_embeddings,
            annoy_manager=annoy_manager, strategy=strategy, **params
        )
        fast_chunk_secs = time.time() - t0

        print(f"πŸ” Original chunks: {len(baseline_chunks)} chunks in {baseline_chunk_secs:.4f}s")
        print(f"πŸš€ ANNOY chunks: {len(fast_chunks)} chunks in {fast_chunk_secs:.4f}s")

        if fast_chunk_secs > 0:
            print(f"⚑ Chunk speedup: {baseline_chunk_secs / fast_chunk_secs:.2f}x")

    print(f"\nβœ… Performance comparison completed!")
{src/pdf-version β†’ customization/src}/indexing/__init__.py RENAMED
File without changes
customization/src/indexing/annoy_manager.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ANNOY index management for PDF-based RAG system."""
2
+
3
+ import os
4
+ import json
5
+ import numpy as np
6
+ from typing import Dict, List, Optional, Tuple, Union
7
+ from pathlib import Path
8
+ import logging
9
+
10
+ try:
11
+ from annoy import AnnoyIndex
12
+ except ImportError:
13
+ raise ImportError("annoy package is required. Install with: pip install annoy")
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class AnnoyIndexManager:
    """Manages ANNOY indices for fast vector similarity search.

    Maintains two independent indices — one over tag (medical-concept)
    embeddings and one over chunk embeddings — plus bidirectional
    id<->key mappings so ANNOY's integer item ids can be translated back
    to tag names and chunk metadata. Indices and mappings are persisted
    to an ``indices/`` directory that sits NEXT TO the embeddings
    directory (see save_indices/load_indices).
    """

    def __init__(self, embedding_dim: int = 1024, metric: str = 'angular'):
        """
        Initialize ANNOY index manager.

        Args:
            embedding_dim: Dimension of embeddings (1024 for BGE Large Medical)
            metric: Distance metric ('angular' for cosine similarity, 'euclidean', 'manhattan', 'hamming', 'dot')
        """
        self.embedding_dim = embedding_dim
        self.metric = metric
        # Built/loaded lazily; None until build_* or load_indices succeeds.
        self.tag_index = None
        self.chunk_index = None
        # tag name <-> integer ANNOY item id
        self.tag_to_id_mapping = {}
        self.id_to_tag_mapping = {}
        # "doc#chunk_id" key <-> integer ANNOY item id / chunk metadata
        self.chunk_to_id_mapping = {}
        self.id_to_chunk_mapping = {}

        logger.info(f"Initialized AnnoyIndexManager: dim={embedding_dim}, metric={metric}")

    def build_tag_index(self, tag_embeddings: Dict[str, np.ndarray], n_trees: int = 50) -> AnnoyIndex:
        """
        Build ANNOY index for tag embeddings.

        Args:
            tag_embeddings: Dictionary mapping tags to their embeddings
            n_trees: Number of trees (more trees = better precision, slower build)

        Returns:
            Built ANNOY index
        """
        logger.info(f"Building tag ANNOY index with {len(tag_embeddings)} tags...")

        # Create index (replaces any previously built tag index)
        self.tag_index = AnnoyIndex(self.embedding_dim, self.metric)

        # Create mappings
        self.tag_to_id_mapping = {}
        self.id_to_tag_mapping = {}

        # Add embeddings to index; item ids follow dict insertion order,
        # so the mappings must be rebuilt together with the index.
        for tag_id, (tag, embedding) in enumerate(tag_embeddings.items()):
            self.tag_index.add_item(tag_id, embedding)
            self.tag_to_id_mapping[tag] = tag_id
            self.id_to_tag_mapping[tag_id] = tag

        # Build index (after build() no more items can be added)
        logger.info(f"Building index with {n_trees} trees...")
        self.tag_index.build(n_trees)

        logger.info(f"βœ… Tag ANNOY index built successfully: {len(tag_embeddings)} tags")
        return self.tag_index

    def build_chunk_index(self, chunk_embeddings: Dict[str, List[Dict]], n_trees: int = 50) -> AnnoyIndex:
        """
        Build ANNOY index for chunk embeddings.

        Args:
            chunk_embeddings: Dictionary mapping document names to lists of chunk dictionaries
            n_trees: Number of trees

        Returns:
            Built ANNOY index
        """
        # Count total chunks
        total_chunks = sum(len(chunks) for chunks in chunk_embeddings.values())
        logger.info(f"Building chunk ANNOY index with {total_chunks} chunks...")

        # Create index (replaces any previously built chunk index)
        self.chunk_index = AnnoyIndex(self.embedding_dim, self.metric)

        # Create mappings
        self.chunk_to_id_mapping = {}
        self.id_to_chunk_mapping = {}

        # Flat integer ids across all documents; "doc#chunk_id" keys keep
        # chunks from different documents distinct.
        chunk_id = 0
        for doc_name, chunks in chunk_embeddings.items():
            for chunk in chunks:
                # Create unique chunk identifier
                chunk_key = f"{doc_name}#{chunk['chunk_id']}"

                # Add to index
                self.chunk_index.add_item(chunk_id, chunk['embedding'])

                # Create mappings; store metadata (not the embedding) so
                # search results can be returned without re-reading docs.
                self.chunk_to_id_mapping[chunk_key] = chunk_id
                self.id_to_chunk_mapping[chunk_id] = {
                    'document': doc_name,
                    'chunk_id': chunk['chunk_id'],
                    'text': chunk['text'],
                    'start_char': chunk.get('start_char', 0),
                    'end_char': chunk.get('end_char', len(chunk['text'])),
                    'token_count': chunk.get('token_count', len(chunk['text'].split())),
                    'chunk_key': chunk_key
                }

                chunk_id += 1

        # Build index
        logger.info(f"Building chunk index with {n_trees} trees...")
        self.chunk_index.build(n_trees)

        logger.info(f"βœ… Chunk ANNOY index built successfully: {total_chunks} chunks")
        return self.chunk_index

    def save_indices(self, output_dir: Union[str, Path]):
        """
        Save ANNOY indices and mappings to disk.

        Writes ``*.ann`` index files plus JSON mapping files and an
        ``annoy_metadata.json`` descriptor. Only indices that currently
        exist are saved; the other is silently skipped.

        Args:
            output_dir: Directory to save indices (the embeddings dir; the
                actual files go to a sibling ``indices/`` directory)
        """
        output_dir = Path(output_dir)
        # Save indices at the same level as embeddings, not inside embeddings
        indices_dir = output_dir.parent / 'indices'
        indices_dir.mkdir(exist_ok=True)

        # Save tag index
        if self.tag_index is not None:
            tag_index_path = indices_dir / 'tag_embeddings.ann'
            self.tag_index.save(str(tag_index_path))

            # Save tag mappings
            tag_mappings_path = indices_dir / 'tag_mappings.json'
            with open(tag_mappings_path, 'w', encoding='utf-8') as f:
                json.dump({
                    'tag_to_id': self.tag_to_id_mapping,
                    'id_to_tag': self.id_to_tag_mapping
                }, f, indent=2, ensure_ascii=False)

            logger.info(f"βœ… Tag index saved: {tag_index_path}")

        # Save chunk index
        if self.chunk_index is not None:
            chunk_index_path = indices_dir / 'chunk_embeddings.ann'
            self.chunk_index.save(str(chunk_index_path))

            # Save chunk mappings
            chunk_mappings_path = indices_dir / 'chunk_mappings.json'
            with open(chunk_mappings_path, 'w', encoding='utf-8') as f:
                json.dump({
                    'chunk_to_id': self.chunk_to_id_mapping,
                    'id_to_chunk': self.id_to_chunk_mapping
                }, f, indent=2, ensure_ascii=False)

            logger.info(f"βœ… Chunk index saved: {chunk_index_path}")

        # Save index metadata (needed by load_indices to reconstruct the
        # AnnoyIndex objects with the right dimension/metric)
        metadata_path = indices_dir / 'annoy_metadata.json'
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump({
                'embedding_dim': self.embedding_dim,
                'metric': self.metric,
                'tag_index_exists': self.tag_index is not None,
                'chunk_index_exists': self.chunk_index is not None,
                'num_tags': len(self.tag_to_id_mapping),
                'num_chunks': len(self.chunk_to_id_mapping)
            }, f, indent=2)

        logger.info(f"βœ… ANNOY indices saved to: {indices_dir}")

    def load_indices(self, input_dir: Union[str, Path]) -> bool:
        """
        Load ANNOY indices and mappings from disk.

        Mirrors save_indices: expects an ``indices/`` directory next to
        *input_dir*. Missing individual indices are tolerated; only a
        missing directory or an I/O/parse failure returns False.

        Args:
            input_dir: Directory containing saved indices (the embeddings dir)

        Returns:
            True if successfully loaded, False otherwise
        """
        input_dir = Path(input_dir)
        # Load indices from the same level as embeddings, not inside embeddings
        indices_dir = input_dir.parent / 'indices'

        if not indices_dir.exists():
            logger.warning(f"Indices directory not found: {indices_dir}")
            return False

        try:
            # Load metadata first so the AnnoyIndex objects are created
            # with the dimension/metric the files were written with
            metadata_path = indices_dir / 'annoy_metadata.json'
            if metadata_path.exists():
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
                    self.embedding_dim = metadata['embedding_dim']
                    self.metric = metadata['metric']
                logger.info(f"Loaded metadata: dim={self.embedding_dim}, metric={self.metric}")

            # Load tag index
            tag_index_path = indices_dir / 'tag_embeddings.ann'
            tag_mappings_path = indices_dir / 'tag_mappings.json'

            if tag_index_path.exists() and tag_mappings_path.exists():
                self.tag_index = AnnoyIndex(self.embedding_dim, self.metric)
                self.tag_index.load(str(tag_index_path))

                with open(tag_mappings_path, 'r', encoding='utf-8') as f:
                    mappings = json.load(f)
                    self.tag_to_id_mapping = mappings['tag_to_id']
                    # JSON object keys are strings; restore the int ids
                    self.id_to_tag_mapping = {int(k): v for k, v in mappings['id_to_tag'].items()}

                logger.info(f"βœ… Tag index loaded: {len(self.tag_to_id_mapping)} tags")

            # Load chunk index
            chunk_index_path = indices_dir / 'chunk_embeddings.ann'
            chunk_mappings_path = indices_dir / 'chunk_mappings.json'

            if chunk_index_path.exists() and chunk_mappings_path.exists():
                self.chunk_index = AnnoyIndex(self.embedding_dim, self.metric)
                self.chunk_index.load(str(chunk_index_path))

                with open(chunk_mappings_path, 'r', encoding='utf-8') as f:
                    mappings = json.load(f)
                    self.chunk_to_id_mapping = mappings['chunk_to_id']
                    # JSON object keys are strings; restore the int ids
                    self.id_to_chunk_mapping = {int(k): v for k, v in mappings['id_to_chunk'].items()}

                logger.info(f"βœ… Chunk index loaded: {len(self.chunk_to_id_mapping)} chunks")

            return True

        except Exception as e:
            logger.error(f"Failed to load ANNOY indices: {e}")
            return False

    def search_tags(self, query_embedding: np.ndarray, n_neighbors: int = 10,
                    include_distances: bool = True) -> Union[List[str], Tuple[List[str], List[float]]]:
        """
        Search for similar tags using ANNOY index.

        Args:
            query_embedding: Query embedding vector
            n_neighbors: Number of nearest neighbors to return
            include_distances: Whether to return distances

        Returns:
            List of tag names, or tuple of (tag_names, distances).
            Distances are in the index's metric (for 'angular', ANNOY's
            angular distance, NOT a cosine similarity — see
            convert_angular_distance_to_cosine_similarity).

        Raises:
            ValueError: If no tag index has been built or loaded.
        """
        if self.tag_index is None:
            raise ValueError("Tag index not built or loaded")

        # Search using ANNOY
        if include_distances:
            neighbor_ids, distances = self.tag_index.get_nns_by_vector(
                query_embedding, n_neighbors, include_distances=True
            )
        else:
            neighbor_ids = self.tag_index.get_nns_by_vector(
                query_embedding, n_neighbors, include_distances=False
            )

        # Convert IDs to tag names
        tag_names = [self.id_to_tag_mapping[neighbor_id] for neighbor_id in neighbor_ids]

        if include_distances:
            return tag_names, distances
        else:
            return tag_names

    def search_chunks(self, query_embedding: np.ndarray, n_neighbors: int = 10,
                      include_distances: bool = True) -> Union[List[Dict], Tuple[List[Dict], List[float]]]:
        """
        Search for similar chunks using ANNOY index.

        Args:
            query_embedding: Query embedding vector
            n_neighbors: Number of nearest neighbors to return
            include_distances: Whether to return distances

        Returns:
            List of chunk dictionaries (metadata stored at build time),
            or tuple of (chunks, distances)

        Raises:
            ValueError: If no chunk index has been built or loaded.
        """
        if self.chunk_index is None:
            raise ValueError("Chunk index not built or loaded")

        # Search using ANNOY
        if include_distances:
            neighbor_ids, distances = self.chunk_index.get_nns_by_vector(
                query_embedding, n_neighbors, include_distances=True
            )
        else:
            neighbor_ids = self.chunk_index.get_nns_by_vector(
                query_embedding, n_neighbors, include_distances=False
            )

        # Convert IDs to chunk info
        chunks = [self.id_to_chunk_mapping[neighbor_id] for neighbor_id in neighbor_ids]

        if include_distances:
            return chunks, distances
        else:
            return chunks

    def search_chunks_in_documents(self, query_embedding: np.ndarray,
                                   document_names: List[str], n_neighbors: int = 10,
                                   include_distances: bool = True) -> Union[List[Dict], Tuple[List[Dict], List[float]]]:
        """
        Search for similar chunks within specific documents.

        Over-fetches from the global chunk index (5x n_neighbors, capped at
        the index size) and post-filters by document, so fewer than
        n_neighbors results may be returned when the target documents'
        chunks rank low globally.

        Args:
            query_embedding: Query embedding vector
            document_names: List of document names to search within
                (NOTE(review): membership test is linear per candidate; a
                set would be faster for many documents)
            n_neighbors: Number of nearest neighbors to return
            include_distances: Whether to return distances

        Returns:
            List of chunk dictionaries, or tuple of (chunks, distances)

        Raises:
            ValueError: If no chunk index has been built or loaded.
        """
        if self.chunk_index is None:
            raise ValueError("Chunk index not built or loaded")

        # Get more candidates than needed since we'll filter by document
        search_candidates = min(n_neighbors * 5, len(self.id_to_chunk_mapping))

        # Search using ANNOY
        if include_distances:
            candidate_ids, distances = self.chunk_index.get_nns_by_vector(
                query_embedding, search_candidates, include_distances=True
            )
        else:
            candidate_ids = self.chunk_index.get_nns_by_vector(
                query_embedding, search_candidates, include_distances=False
            )

        # Filter by document names and take top n_neighbors
        # (candidates arrive nearest-first, so the first matches are best)
        filtered_chunks = []
        filtered_distances = [] if include_distances else None

        for i, candidate_id in enumerate(candidate_ids):
            chunk_info = self.id_to_chunk_mapping[candidate_id]
            if chunk_info['document'] in document_names:
                filtered_chunks.append(chunk_info)
                if include_distances:
                    filtered_distances.append(distances[i])

                if len(filtered_chunks) >= n_neighbors:
                    break

        if include_distances:
            return filtered_chunks, filtered_distances
        else:
            return filtered_chunks

    def get_index_stats(self) -> Dict:
        """Get statistics about the loaded indices.

        Returns:
            Dict with dimension, metric, per-index load flags and counts.
        """
        stats = {
            'embedding_dim': self.embedding_dim,
            'metric': self.metric,
            'tag_index_loaded': self.tag_index is not None,
            'chunk_index_loaded': self.chunk_index is not None,
            'num_tags': len(self.tag_to_id_mapping) if self.tag_index else 0,
            'num_chunks': len(self.chunk_to_id_mapping) if self.chunk_index else 0
        }
        return stats
376
+
377
+
378
def convert_angular_distance_to_cosine_similarity(angular_distance: float) -> float:
    """
    Convert an ANNOY 'angular' distance to a cosine similarity.

    ANNOY's 'angular' metric is the Euclidean distance between the
    normalized vectors: d = sqrt(2 * (1 - cos_sim)), so inverting gives
    cos_sim = 1 - d**2 / 2.

    (The previous implementation computed cos(d * pi / 2), which treats d
    as a normalized angle — that is not what ANNOY returns. E.g. for
    orthogonal vectors ANNOY reports d = sqrt(2), which must map to 0.0.)

    Args:
        angular_distance: Angular distance from ANNOY (in [0, 2]).

    Returns:
        Cosine similarity in [-1.0, 1.0] (1.0 means identical direction).
    """
    similarity = 1.0 - (angular_distance * angular_distance) / 2.0
    # Clamp away tiny floating-point excursions outside the valid range.
    return max(-1.0, min(1.0, similarity))
{src/pdf-version β†’ customization/src}/indexing/document_indexer.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/indexing/embedding_creator.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/indexing/storage.py RENAMED
@@ -2,13 +2,19 @@
2
 
3
  import json
4
  import os
 
5
  from typing import Dict, Optional, Tuple
6
  import numpy as np
 
 
 
 
 
7
 
8
 
9
  def save_document_system(document_index: Dict, tag_embeddings: Dict,
10
  doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
11
- output_dir: str = None):
12
  """Save the complete document indexing system.
13
 
14
  Args:
@@ -85,6 +91,31 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
85
  with open(os.path.join(output_dir, 'chunk_embeddings.json'), 'w', encoding='utf-8') as f:
86
  json.dump(chunk_embeddings_serializable, f, indent=2, ensure_ascii=False)
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  print("βœ… Document system saved to files")
89
 
90
 
@@ -161,4 +192,62 @@ def load_document_system(input_dir: str = None) -> Tuple[Optional[Dict], Optiona
161
 
162
  except Exception as e:
163
  print(f"❌ Failed to load document system: {e}")
164
- return None, None, None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import json
4
  import os
5
+ import logging
6
  from typing import Dict, Optional, Tuple
7
  import numpy as np
8
+ from .annoy_manager import AnnoyIndexManager
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
 
15
  def save_document_system(document_index: Dict, tag_embeddings: Dict,
16
  doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
17
+ output_dir: str = None, build_annoy_indices: bool = True):
18
  """Save the complete document indexing system.
19
 
20
  Args:
 
91
  with open(os.path.join(output_dir, 'chunk_embeddings.json'), 'w', encoding='utf-8') as f:
92
  json.dump(chunk_embeddings_serializable, f, indent=2, ensure_ascii=False)
93
 
94
+ # Build and save ANNOY indices if requested
95
+ if build_annoy_indices:
96
+ logger.info("πŸ”§ Building ANNOY indices for fast retrieval...")
97
+ try:
98
+ # Initialize ANNOY manager (assuming BGE Large Medical embedding dimension)
99
+ annoy_manager = AnnoyIndexManager(embedding_dim=1024, metric='angular')
100
+
101
+ # Build tag index
102
+ logger.info("Building tag ANNOY index...")
103
+ annoy_manager.build_tag_index(tag_embeddings, n_trees=50)
104
+
105
+ # Build chunk index if chunk embeddings are provided
106
+ if chunk_embeddings:
107
+ logger.info("Building chunk ANNOY index...")
108
+ annoy_manager.build_chunk_index(chunk_embeddings, n_trees=50)
109
+
110
+ # Save indices
111
+ logger.info("Saving ANNOY indices...")
112
+ annoy_manager.save_indices(output_dir)
113
+
114
+ logger.info("βœ… ANNOY indices built and saved successfully")
115
+ except Exception as e:
116
+ logger.error(f"❌ Failed to build ANNOY indices: {e}")
117
+ logger.warning("Continuing without ANNOY indices - will use original search methods")
118
+
119
  print("βœ… Document system saved to files")
120
 
121
 
 
192
 
193
  except Exception as e:
194
  print(f"❌ Failed to load document system: {e}")
195
+ return None, None, None, None
196
+
197
+
198
def load_annoy_manager(input_dir: str = None) -> Optional[AnnoyIndexManager]:
    """
    Load ANNOY index manager with pre-built indices.

    Args:
        input_dir: Input directory containing saved indices.  Defaults to
            <project root>/embeddings/pdfembeddings when omitted.

    Returns:
        AnnoyIndexManager instance or None if loading fails
    """
    if input_dir is None:
        from pathlib import Path
        # Four levels up from this module is the project root.
        project_root = Path(__file__).parent.parent.parent.parent
        input_dir = project_root / 'embeddings' / 'pdfembeddings'

    try:
        # Dimension matches the BGE Large Medical embedding model.
        manager = AnnoyIndexManager(embedding_dim=1024, metric='angular')
        if manager.load_indices(input_dir):
            logger.info("βœ… ANNOY indices loaded successfully")
            return manager
        logger.warning("⚠️ Failed to load ANNOY indices")
        return None
    except Exception as e:
        logger.error(f"❌ Failed to initialize ANNOY manager: {e}")
        return None
229
+
230
+
231
def load_document_system_with_annoy(input_dir: str = None, annoy_dir: str = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict], Optional[AnnoyIndexManager]]:
    """
    Load the complete document indexing system including ANNOY indices.

    Args:
        input_dir: Input directory containing saved files
        annoy_dir: Directory containing ANNOY indices (if different from input_dir)

    Returns:
        Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager).
        Returns all None values if loading fails.
    """
    # The base system is mandatory; ANNOY indices are an optional accelerator.
    doc_index, tag_emb, doc_tags, chunk_emb = load_document_system(input_dir)
    if doc_index is None:
        return None, None, None, None, None

    # ANNOY indices may live in a dedicated directory; fall back to input_dir.
    manager = load_annoy_manager(annoy_dir or input_dir)
    return doc_index, tag_emb, doc_tags, chunk_emb, manager
{src/pdf-version β†’ customization/src}/models/__init__.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/models/embedding_models.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/rag/__init__.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/rag/medical_rag_pipeline.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/retrieval/__init__.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/retrieval/chunk_retriever.py RENAMED
@@ -1,9 +1,15 @@
1
  """Chunk-level retrieval functionality."""
2
 
3
- from typing import List, Dict, Callable
4
  import numpy as np
 
5
  from sentence_transformers import SentenceTransformer
6
- from src.indexing.embedding_creator import create_text_embedding
 
 
 
 
 
7
 
8
 
9
  def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
@@ -190,4 +196,172 @@ def get_chunks_for_rag(relevant_chunks: List[Dict], max_chunks: int = 10) -> Lis
190
  rag_chunks.append(formatted_chunk)
191
 
192
  print(f"πŸ“„ Retrieved {len(rag_chunks)} chunks for RAG")
193
- return rag_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Chunk-level retrieval functionality."""
2
 
3
+ from typing import List, Dict, Callable, Optional
4
  import numpy as np
5
+ import logging
6
  from sentence_transformers import SentenceTransformer
7
+ from indexing.embedding_creator import create_text_embedding
8
+ from indexing.annoy_manager import AnnoyIndexManager, convert_angular_distance_to_cosine_similarity
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
 
15
  def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
 
196
  rag_chunks.append(formatted_chunk)
197
 
198
  print(f"πŸ“„ Retrieved {len(rag_chunks)} chunks for RAG")
199
+ return rag_chunks
200
+
201
+
202
+ # ANNOY-accelerated chunk retrieval functions
203
+
204
+ def find_relevant_chunks_annoy_top_k(query: str, model: SentenceTransformer,
205
+ relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
206
+ top_chunks_per_doc: int = 3,
207
+ similarity_metric: str = "angular") -> List[Dict]:
208
+ """Find most relevant chunks using ANNOY index and Top-K strategy."""
209
+ query_embedding = create_text_embedding(model, query)
210
+
211
+ # Use ANNOY to search chunks in the relevant documents
212
+ all_chunks, distances = annoy_manager.search_chunks_in_documents(
213
+ query_embedding, relevant_docs,
214
+ n_neighbors=len(relevant_docs) * top_chunks_per_doc,
215
+ include_distances=True
216
+ )
217
+
218
+ # Convert distances to similarities and format results
219
+ all_relevant_chunks = []
220
+ for chunk, distance in zip(all_chunks, distances):
221
+ similarity = convert_angular_distance_to_cosine_similarity(distance)
222
+
223
+ chunk_result = {
224
+ 'document': chunk['document'],
225
+ 'chunk_id': chunk['chunk_id'],
226
+ 'text': chunk['text'],
227
+ 'start_char': chunk.get('start_char', 0),
228
+ 'end_char': chunk.get('end_char', len(chunk['text'])),
229
+ 'token_count': chunk.get('token_count', len(chunk['text'].split())),
230
+ 'similarity': similarity
231
+ }
232
+ all_relevant_chunks.append(chunk_result)
233
+
234
+ # Group by document and take top chunks per document
235
+ doc_chunks = {}
236
+ for chunk in all_relevant_chunks:
237
+ doc_name = chunk['document']
238
+ if doc_name not in doc_chunks:
239
+ doc_chunks[doc_name] = []
240
+ doc_chunks[doc_name].append(chunk)
241
+
242
+ # Take top chunks from each document
243
+ final_chunks = []
244
+ for doc_name in relevant_docs:
245
+ if doc_name in doc_chunks:
246
+ doc_chunks[doc_name].sort(key=lambda x: x['similarity'], reverse=True)
247
+ final_chunks.extend(doc_chunks[doc_name][:top_chunks_per_doc])
248
+
249
+ # Sort all chunks by similarity
250
+ final_chunks.sort(key=lambda x: x['similarity'], reverse=True)
251
+
252
+ logger.info(f"πŸš€ Found {len(final_chunks)} relevant chunks (ANNOY Top-K)")
253
+ for i, chunk in enumerate(final_chunks[:5]): # Show top 5
254
+ logger.info(f" {i+1}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
255
+ logger.info(f" Preview: {chunk['text'][:100]}...")
256
+
257
+ return final_chunks
258
+
259
+
260
+ def find_relevant_chunks_annoy_top_p(query: str, model: SentenceTransformer,
261
+ relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
262
+ top_p: float = 0.6, min_similarity: float = 0.3,
263
+ similarity_metric: str = "angular") -> List[Dict]:
264
+ """Find most relevant chunks using ANNOY index and Top-P strategy."""
265
+ query_embedding = create_text_embedding(model, query)
266
+
267
+ # Search more chunks to ensure we have enough candidates for Top-P selection
268
+ search_candidates = min(len(relevant_docs) * 10, 100) # Reasonable upper limit
269
+
270
+ # Use ANNOY to search chunks in the relevant documents
271
+ all_chunks, distances = annoy_manager.search_chunks_in_documents(
272
+ query_embedding, relevant_docs,
273
+ n_neighbors=search_candidates,
274
+ include_distances=True
275
+ )
276
+
277
+ # Convert distances to similarities and filter by minimum similarity
278
+ filtered_chunks = []
279
+ for chunk, distance in zip(all_chunks, distances):
280
+ similarity = convert_angular_distance_to_cosine_similarity(distance)
281
+
282
+ # Only include chunks above minimum similarity threshold
283
+ if similarity >= min_similarity:
284
+ chunk_result = {
285
+ 'document': chunk['document'],
286
+ 'chunk_id': chunk['chunk_id'],
287
+ 'text': chunk['text'],
288
+ 'start_char': chunk.get('start_char', 0),
289
+ 'end_char': chunk.get('end_char', len(chunk['text'])),
290
+ 'token_count': chunk.get('token_count', len(chunk['text'].split())),
291
+ 'similarity': similarity
292
+ }
293
+ filtered_chunks.append(chunk_result)
294
+
295
+ if not filtered_chunks:
296
+ logger.warning(f"⚠️ No chunks found above similarity threshold {min_similarity}")
297
+ return []
298
+
299
+ # Sort by similarity
300
+ filtered_chunks.sort(key=lambda x: x['similarity'], reverse=True)
301
+
302
+ # Apply Top-P selection
303
+ total_score = sum(chunk['similarity'] for chunk in filtered_chunks)
304
+ cumulative_prob = 0.0
305
+ selected_chunks = []
306
+
307
+ for chunk in filtered_chunks:
308
+ prob = chunk['similarity'] / total_score
309
+ cumulative_prob += prob
310
+ selected_chunks.append(chunk)
311
+
312
+ # Stop when we reach the Top-P threshold
313
+ if cumulative_prob >= top_p:
314
+ break
315
+
316
+ logger.info(f"πŸš€ Found {len(selected_chunks)} relevant chunks (ANNOY Top-P={top_p})")
317
+ logger.info(f"πŸ“Š Filtered from {len(filtered_chunks)} chunks above threshold")
318
+ logger.info(f"πŸ“Š Cumulative probability: {cumulative_prob:.3f}")
319
+
320
+ for i, chunk in enumerate(selected_chunks[:5]): # Show top 5
321
+ logger.info(f" {i+1}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
322
+ logger.info(f" Preview: {chunk['text'][:100]}...")
323
+
324
+ return selected_chunks
325
+
326
+
327
+ def find_relevant_chunks_annoy(query: str, model: SentenceTransformer,
328
+ relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
329
+ strategy: str = "top_p", **kwargs) -> List[Dict]:
330
+ """Unified interface for ANNOY-accelerated chunk retrieval with different strategies."""
331
+
332
+ similarity_metric = kwargs.get("similarity_metric", "angular")
333
+
334
+ if strategy == "top_k":
335
+ top_chunks_per_doc = kwargs.get("top_chunks_per_doc", 3)
336
+ return find_relevant_chunks_annoy_top_k(query, model, relevant_docs, annoy_manager,
337
+ top_chunks_per_doc, similarity_metric)
338
+
339
+ elif strategy == "top_p":
340
+ top_p = kwargs.get("top_p", 0.6)
341
+ min_similarity = kwargs.get("min_similarity", 0.3)
342
+ return find_relevant_chunks_annoy_top_p(query, model, relevant_docs, annoy_manager,
343
+ top_p, min_similarity, similarity_metric)
344
+
345
+ else:
346
+ raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k' or 'top_p'")
347
+
348
+
349
+ def find_relevant_chunks_with_fallback(query: str, model: SentenceTransformer,
350
+ relevant_docs: List[str], chunk_embeddings: Dict,
351
+ annoy_manager: Optional[AnnoyIndexManager] = None,
352
+ strategy: str = "top_p", **kwargs) -> List[Dict]:
353
+ """
354
+ Find relevant chunks with ANNOY acceleration and fallback to original method.
355
+
356
+ This function automatically uses ANNOY if available, otherwise falls back to original search.
357
+ """
358
+ if annoy_manager is not None:
359
+ try:
360
+ logger.info("πŸš€ Using ANNOY-accelerated chunk retrieval")
361
+ return find_relevant_chunks_annoy(query, model, relevant_docs, annoy_manager, strategy, **kwargs)
362
+ except Exception as e:
363
+ logger.warning(f"⚠️ ANNOY chunk retrieval failed, falling back to original method: {e}")
364
+
365
+ # Fallback to original method
366
+ logger.info("πŸ” Using original chunk retrieval method")
367
+ return find_relevant_chunks(query, model, relevant_docs, chunk_embeddings, strategy, **kwargs)
{src/pdf-version β†’ customization/src}/retrieval/document_retriever.py RENAMED
@@ -1,9 +1,15 @@
1
  """Document retrieval strategies and functionality."""
2
 
3
- from typing import List, Dict
4
  import numpy as np
 
5
  from sentence_transformers import SentenceTransformer
6
- from src.indexing.embedding_creator import create_text_embedding
 
 
 
 
 
7
 
8
 
9
  def find_relevant_documents_top_k(query: str, model: SentenceTransformer,
@@ -189,4 +195,202 @@ def create_document_tag_mapping(document_index: Dict, tag_embeddings: Dict) -> D
189
  'treatments': doc_info.get('treatments', [])
190
  }
191
 
192
- return doc_tag_mapping
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Document retrieval strategies and functionality."""
2
 
3
+ from typing import List, Dict, Optional
4
  import numpy as np
5
+ import logging
6
  from sentence_transformers import SentenceTransformer
7
+ from indexing.embedding_creator import create_text_embedding
8
+ from indexing.annoy_manager import AnnoyIndexManager, convert_angular_distance_to_cosine_similarity
9
+
10
+ # Configure logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
 
14
 
15
  def find_relevant_documents_top_k(query: str, model: SentenceTransformer,
 
195
  'treatments': doc_info.get('treatments', [])
196
  }
197
 
198
+ return doc_tag_mapping
199
+
200
+
201
+ # ANNOY-accelerated document retrieval functions
202
+
203
def find_relevant_documents_annoy_top_k(query: str, model: SentenceTransformer,
                                        annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                        top_k: int = 3, search_neighbors: int = 20) -> List[str]:
    """Find top-k most relevant documents using ANNOY index for fast tag search.

    Each document is scored by its single best-matching tag similarity; the
    ``top_k`` highest-scoring documents are returned.
    """
    query_vec = create_text_embedding(model, query)

    # Approximate nearest-neighbor lookup over the medical-concept tags.
    tags, dists = annoy_manager.search_tags(
        query_vec, n_neighbors=search_neighbors, include_distances=True
    )
    tag_similarities = {
        tag: convert_angular_distance_to_cosine_similarity(dist)
        for tag, dist in zip(tags, dists)
    }

    # Score every document by its strongest tag match (max over its tags).
    doc_scores: Dict[str, float] = {}
    for pdf_name, doc_info in doc_tag_mapping.items():
        doc_tags = doc_info['tags']
        if doc_tags:
            doc_scores[pdf_name] = max(tag_similarities.get(t, 0) for t in doc_tags)

    ranked = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
    relevant_docs = [name for name, _ in ranked[:top_k]]

    logger.info(f"πŸš€ Found {len(relevant_docs)} relevant documents for query: '{query}' (ANNOY TOP-K)")
    for i, doc_name in enumerate(relevant_docs):
        logger.info(f" {i+1}. {doc_name} (similarity: {doc_scores[doc_name]:.3f})")

    return relevant_docs
242
+
243
+
244
def find_relevant_documents_annoy_top_p(query: str, model: SentenceTransformer,
                                        annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                        top_p: float = 0.6, min_similarity: float = 0.5,
                                        search_neighbors: int = 30) -> List[str]:
    """Find documents using TOP-P (nucleus sampling) approach with ANNOY acceleration.

    Documents below ``min_similarity`` are dropped; the rest are selected in
    score order until their normalized score mass reaches ``top_p``.
    """
    query_vec = create_text_embedding(model, query)

    tags, dists = annoy_manager.search_tags(
        query_vec, n_neighbors=search_neighbors, include_distances=True
    )
    tag_similarities = {
        tag: convert_angular_distance_to_cosine_similarity(dist)
        for tag, dist in zip(tags, dists)
    }

    # Score each document by its strongest tag match.
    doc_scores: Dict[str, float] = {}
    for pdf_name, doc_info in doc_tag_mapping.items():
        doc_tags = doc_info['tags']
        if doc_tags:
            doc_scores[pdf_name] = max(tag_similarities.get(t, 0) for t in doc_tags)

    # Enforce the similarity floor before the nucleus selection.
    filtered_docs = {name: s for name, s in doc_scores.items() if s >= min_similarity}
    if not filtered_docs:
        logger.warning(f"⚠️ No documents found above similarity threshold {min_similarity}")
        return []

    sorted_docs = sorted(filtered_docs.items(), key=lambda item: item[1], reverse=True)

    # Nucleus (TOP-P) cut-off over the normalized score mass.
    total_score = sum(score for _, score in sorted_docs)
    selected_docs: List[str] = []
    cumulative_prob = 0.0
    for doc_name, score in sorted_docs:
        cumulative_prob += score / total_score
        selected_docs.append(doc_name)
        if cumulative_prob >= top_p:
            break

    logger.info(f"πŸš€ Found {len(selected_docs)} relevant documents for query: '{query}' (ANNOY TOP-P={top_p})")
    logger.info(f"πŸ“Š Cumulative probability: {cumulative_prob:.3f}")

    for i, doc_name in enumerate(selected_docs):
        score = doc_scores[doc_name]
        logger.info(f" {i+1}. {doc_name} (similarity: {score:.3f}, prob: {score / total_score:.3f})")

    return selected_docs
308
+
309
+
310
def find_relevant_documents_annoy_threshold(query: str, model: SentenceTransformer,
                                            annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                            similarity_threshold: float = 0.5, search_neighbors: int = 50) -> List[str]:
    """Find all documents above a similarity threshold using ANNOY acceleration.

    Unlike the top-k/top-p variants, the score here blends the mean and the
    max tag similarity, and every document above ``similarity_threshold`` is
    returned (sorted by score, descending).
    """
    query_vec = create_text_embedding(model, query)

    tags, dists = annoy_manager.search_tags(
        query_vec, n_neighbors=search_neighbors, include_distances=True
    )
    tag_similarities = {
        tag: convert_angular_distance_to_cosine_similarity(dist)
        for tag, dist in zip(tags, dists)
    }

    doc_scores: Dict[str, float] = {}
    for pdf_name, doc_info in doc_tag_mapping.items():
        doc_tags = doc_info['tags']
        if not doc_tags:
            continue
        sims = [tag_similarities.get(t, 0) for t in doc_tags]
        # Weighted combination: 70% average (overall relevance) + 30% max (strongest match)
        blended = np.mean(sims) * 0.7 + max(sims) * 0.3
        if blended >= similarity_threshold:
            doc_scores[pdf_name] = blended

    ranked = sorted(doc_scores.items(), key=lambda item: item[1], reverse=True)
    relevant_docs = [name for name, _ in ranked]

    logger.info(f"πŸš€ Found {len(relevant_docs)} relevant documents for query: '{query}' (ANNOY threshold={similarity_threshold})")
    for i, doc_name in enumerate(relevant_docs):
        logger.info(f" {i+1}. {doc_name} (similarity: {doc_scores[doc_name]:.3f})")

    return relevant_docs
352
+
353
+
354
def find_relevant_documents_annoy(query: str, model: SentenceTransformer,
                                  annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                  strategy: str = "top_k", **kwargs) -> List[str]:
    """Unified interface for ANNOY-accelerated document retrieval with different strategies.

    Supported strategies: ``top_k``, ``top_p``, ``threshold``.  Each strategy
    pulls its own options from ``kwargs`` with sensible defaults.

    Raises:
        ValueError: if ``strategy`` is not one of the supported names.
    """
    if strategy == "top_k":
        return find_relevant_documents_annoy_top_k(
            query, model, annoy_manager, doc_tag_mapping,
            kwargs.get("top_k", 3), kwargs.get("search_neighbors", 20))

    if strategy == "top_p":
        return find_relevant_documents_annoy_top_p(
            query, model, annoy_manager, doc_tag_mapping,
            kwargs.get("top_p", 0.6), kwargs.get("min_similarity", 0.5),
            kwargs.get("search_neighbors", 30))

    if strategy == "threshold":
        return find_relevant_documents_annoy_threshold(
            query, model, annoy_manager, doc_tag_mapping,
            kwargs.get("similarity_threshold", 0.5), kwargs.get("search_neighbors", 50))

    raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k', 'top_p', or 'threshold'")
376
+
377
+
378
def find_relevant_documents_with_fallback(query: str, model: SentenceTransformer,
                                          tag_embeddings: Dict, doc_tag_mapping: Dict,
                                          annoy_manager: Optional[AnnoyIndexManager] = None,
                                          strategy: str = "top_k", **kwargs) -> List[str]:
    """
    Find relevant documents with ANNOY acceleration and fallback to original method.

    This function automatically uses ANNOY if available, otherwise falls back to original search.
    """
    if annoy_manager is not None:
        # Fast path: any ANNOY failure degrades gracefully to the exhaustive search.
        try:
            logger.info("πŸš€ Using ANNOY-accelerated document retrieval")
            return find_relevant_documents_annoy(
                query, model, annoy_manager, doc_tag_mapping, strategy, **kwargs)
        except Exception as e:
            logger.warning(f"⚠️ ANNOY retrieval failed, falling back to original method: {e}")

    # Slow path: exhaustive similarity search over the raw tag embeddings.
    logger.info("πŸ” Using original document retrieval method")
    return find_relevant_documents(query, model, tag_embeddings, doc_tag_mapping, strategy, **kwargs)
{src/pdf-version β†’ customization/src}/utils/__init__.py RENAMED
File without changes
{src/pdf-version β†’ customization/src}/utils/helpers.py RENAMED
File without changes
customization/test/test_pipeline.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Test script to verify the customization pipeline with ANNOY indices."""
3
+
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ # Add parent directory to path
8
+ sys.path.insert(0, str(Path(__file__).parent.parent))
9
+
10
+ from customization_pipeline import retrieve_document_chunks
11
+
12
+
13
def test_pipeline():
    """Run the end-to-end retrieval pipeline over a set of sample queries."""
    print("πŸ§ͺ Testing Customization Pipeline with ANNOY Indices")
    print("=" * 60)

    # Representative emergency-medicine queries.
    test_queries = [
        "chest pain and shortness of breath",
        "pregnancy bleeding emergency",
        "atrial fibrillation treatment",
        "fever of unknown origin",
        "dizziness diagnostic approach",
    ]

    for query in test_queries:
        print(f"\nπŸ“‹ Query: '{query}'")
        print("-" * 60)

        try:
            hits = retrieve_document_chunks(query, top_k=3)
            if not hits:
                print("❌ No results found")
                continue

            print(f"βœ… Found {len(hits)} relevant chunks:\n")
            for idx, hit in enumerate(hits, 1):
                print(f"Result {idx}:")
                print(f" πŸ“„ Document: {hit['document']}")
                print(f" πŸ“Š Score: {hit['score']:.4f}")
                print(f" πŸ“ Chunk ID: {hit['metadata']['chunk_id']}")
                print(f" πŸ“– Text Preview: {hit['chunk_text'][:150]}...")
                print()
        except Exception as e:
            # Keep going: one failing query should not abort the whole run.
            print(f"❌ Error processing query: {e}")
            import traceback
            traceback.print_exc()

    print("\n" + "=" * 60)
    print("βœ… Pipeline test completed!")
55
+
56
+
57
def test_specific_medical_cases():
    """Exercise the retriever on a handful of named emergency scenarios."""
    print("\n\nπŸ₯ Testing Specific Medical Cases")
    print("=" * 60)

    # Scenario label -> representative clinical query.
    medical_cases = {
        "Cardiac Emergency": "acute coronary syndrome ST elevation",
        "Neurological": "stroke symptoms thrombolysis window",
        "Respiratory": "pulmonary embolism Wells score",
        "Obstetric Emergency": "eclampsia magnesium sulfate",
        "Pediatric": "pediatric seizure management",
    }

    for case_type, query in medical_cases.items():
        print(f"\nπŸ” {case_type}: '{query}'")
        print("-" * 60)

        hits = retrieve_document_chunks(query, top_k=2)
        if not hits:
            print(" No specific guidance found")
            continue

        for hit in hits:
            print(f"πŸ“„ {hit['document']}")
            print(f" Score: {hit['score']:.4f}")
            print(f" Relevant content found in chunk {hit['metadata']['chunk_id']}")
83
+
84
+
85
def test_performance():
    """Time a batch of queries and report the average retrieval latency."""
    import time

    print("\n\n⚑ Testing Retrieval Performance")
    print("=" * 60)

    queries = [
        "chest pain",
        "headache emergency",
        "fever neutropenia",
        "pneumonia antibiotics",
        "atrial fibrillation",
    ]

    total_time = 0
    for query in queries:
        started = time.time()
        results = retrieve_document_chunks(query, top_k=5)
        elapsed = time.time() - started
        total_time += elapsed
        print(f"Query: '{query}' - Retrieved {len(results)} chunks in {elapsed:.3f}s")

    avg_time = total_time / len(queries)
    print(f"\nπŸ“Š Average retrieval time: {avg_time:.3f}s per query")
111
+
112
+
113
+ if __name__ == "__main__":
114
+ # Run all tests
115
+ test_pipeline()
116
+ test_specific_medical_cases()
117
+ test_performance()
customization_requirements.txt ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Customization Pipeline Requirements
2
+ # Generated from rag_env environment for hospital-specific document processing
3
+ #
4
+ # Key libraries:
5
+ # - sentence-transformers: Medical domain embeddings (BGE-Large-Medical)
6
+ # - torch: Deep learning framework
7
+ # - annoy: Fast vector similarity search indices
8
+ # - pdfplumber: PDF text and table extraction
9
+ # - llama-index: Document chunking and processing
10
+ # - transformers: Hugging Face model support
11
+ # - openai: LLM integration (optional)
12
+ #
13
+ # Install with: pip install -r customization_requirements.txt
14
+ #
15
+ accelerate==1.9.0
16
+ acres==0.5.0
17
+ aiohappyeyeballs==2.6.1
18
+ aiohttp==3.12.14
19
+ aiosignal==1.4.0
20
+ aiosqlite==0.21.0
21
+ annotated-types==0.7.0
22
+ annoy==1.17.3
23
+ anyio==4.9.0
24
+ appnope==0.1.4
25
+ asttokens==3.0.0
26
+ attrs==25.3.0
27
+ banks==2.2.0
28
+ beautifulsoup4==4.13.4
29
+ bm25s==0.2.13
30
+ certifi==2025.7.14
31
+ cffi==1.17.1
32
+ charset-normalizer==3.4.2
33
+ ci-info==0.3.0
34
+ click==8.2.1
35
+ colorama==0.4.6
36
+ comm==0.2.2
37
+ configobj==5.0.9
38
+ configparser==7.2.0
39
+ cryptography==45.0.5
40
+ dataclasses-json==0.6.7
41
+ debugpy==1.8.15
42
+ decorator==5.2.1
43
+ defusedxml==0.7.1
44
+ Deprecated==1.2.18
45
+ dirtyjson==1.0.8
46
+ distro==1.9.0
47
+ easyocr==1.7.2
48
+ etelemetry==0.3.1
49
+ executing==2.2.0
50
+ filelock==3.18.0
51
+ filetype==1.2.0
52
+ fitz==0.0.1.dev2
53
+ frozenlist==1.7.0
54
+ fsspec==2025.7.0
55
+ greenlet==3.2.3
56
+ griffe==1.7.3
57
+ h11==0.16.0
58
+ hf-xet==1.1.5
59
+ httpcore==1.0.9
60
+ httplib2==0.22.0
61
+ httpx==0.28.1
62
+ huggingface-hub==0.33.4
63
+ idna==3.10
64
+ imageio==2.37.0
65
+ ipykernel==6.30.0
66
+ ipython==9.4.0
67
+ ipython_pygments_lexers==1.1.1
68
+ jedi==0.19.2
69
+ Jinja2==3.1.6
70
+ jiter==0.10.0
71
+ joblib==1.5.1
72
+ jpype1==1.6.0
73
+ jupyter_client==8.6.3
74
+ jupyter_core==5.8.1
75
+ lazy_loader==0.4
76
+ llama-cloud==0.1.32
77
+ llama-cloud-services==0.6.43
78
+ llama-index==0.12.50
79
+ llama-index-agent-openai==0.4.12
80
+ llama-index-cli==0.4.4
81
+ llama-index-core==0.12.50
82
+ llama-index-embeddings-huggingface==0.5.5
83
+ llama-index-embeddings-openai==0.3.1
84
+ llama-index-indices-managed-llama-cloud==0.7.10
85
+ llama-index-instrumentation==0.3.0
86
+ llama-index-llms-huggingface==0.5.0
87
+ llama-index-llms-openai==0.4.7
88
+ llama-index-llms-openai-like==0.4.0
89
+ llama-index-llms-openrouter==0.3.2
90
+ llama-index-multi-modal-llms-openai==0.5.3
91
+ llama-index-program-openai==0.3.2
92
+ llama-index-question-gen-openai==0.3.1
93
+ llama-index-readers-file==0.4.11
94
+ llama-index-readers-llama-parse==0.4.0
95
+ llama-index-retrievers-bm25==0.5.2
96
+ llama-index-workflows==1.1.0
97
+ llama-parse==0.6.43
98
+ looseversion==1.3.0
99
+ lxml==6.0.0
100
+ MarkupSafe==3.0.2
101
+ marshmallow==3.26.1
102
+ matplotlib-inline==0.1.7
103
+ mpmath==1.3.0
104
+ multidict==6.6.3
105
+ mypy_extensions==1.1.0
106
+ nest-asyncio==1.6.0
107
+ networkx==3.5
108
+ nibabel==5.3.2
109
+ ninja==1.11.1.4
110
+ nipype==1.10.0
111
+ nltk==3.9.1
112
+ numpy==2.2.6
113
+ openai==1.97.0
114
+ opencv-python-headless==4.12.0.88
115
+ packaging==25.0
116
+ pandas==2.2.3
117
+ parso==0.8.4
118
+ pathlib==1.0.1
119
+ pdfminer.six==20250506
120
+ pdfplumber==0.11.7
121
+ pexpect==4.9.0
122
+ pillow==11.3.0
123
+ platformdirs==4.3.8
124
+ prompt_toolkit==3.0.51
125
+ propcache==0.3.2
126
+ prov==2.1.1
127
+ psutil==7.0.0
128
+ ptyprocess==0.7.0
129
+ pure_eval==0.2.3
130
+ puremagic==1.30
131
+ pyclipper==1.3.0.post6
132
+ pycparser==2.22
133
+ pydantic==2.11.7
134
+ pydantic_core==2.33.2
135
+ pydot==4.0.1
136
+ Pygments==2.19.2
137
+ PyMuPDF==1.26.3
138
+ pyparsing==3.2.3
139
+ pypdf==5.8.0
140
+ pypdfium2==4.30.0
141
+ PyStemmer==2.2.0.3
142
+ python-bidi==0.6.6
143
+ python-dateutil==2.9.0.post0
144
+ python-dotenv==1.1.1
145
+ pytz==2025.2
146
+ pyxnat==1.6.3
147
+ PyYAML==6.0.2
148
+ pyzmq==27.0.0
149
+ rdflib==7.1.4
150
+ regex==2024.11.6
151
+ requests==2.32.4
152
+ safetensors==0.5.3
153
+ scikit-image==0.25.2
154
+ scikit-learn==1.7.1
155
+ scipy==1.16.0
156
+ sentence-transformers==5.0.0
157
+ setuptools==80.9.0
158
+ shapely==2.1.1
159
+ simplejson==3.20.1
160
+ six==1.17.0
161
+ sniffio==1.3.1
162
+ soupsieve==2.7
163
+ SQLAlchemy==2.0.41
164
+ stack-data==0.6.3
165
+ striprtf==0.0.26
166
+ sympy==1.14.0
167
+ tabula-py==2.10.0
168
+ tabulate==0.9.0
169
+ tenacity==9.1.2
170
+ threadpoolctl==3.6.0
171
+ tifffile==2025.6.11
172
+ tiktoken==0.9.0
173
+ tokenizers==0.21.2
174
+ torch==2.7.1
175
+ torchvision==0.22.1
176
+ tornado==6.5.1
177
+ tqdm==4.67.1
178
+ traitlets==5.14.3
179
+ traits==7.0.2
180
+ transformers==4.53.2
181
+ typing-inspect==0.9.0
182
+ typing-inspection==0.4.1
183
+ typing_extensions==4.14.1
184
+ tzdata==2025.2
185
+ urllib3==2.5.0
186
+ wcwidth==0.2.13
187
+ wrapt==1.17.2
188
+ yarl==1.20.1
src/pdf-version/data/__init__.py DELETED
@@ -1,15 +0,0 @@
1
- """Data loading and PDF processing."""
2
-
3
- from .loaders import load_annotations, filter_pdf_files
4
- from .pdf_processing import (
5
- extract_pdf_text,
6
- extract_tables_from_pdf,
7
- extract_images_ocr_from_pdf,
8
- extract_pdf_content_enhanced
9
- )
10
-
11
- __all__ = [
12
- 'load_annotations', 'filter_pdf_files',
13
- 'extract_pdf_text', 'extract_tables_from_pdf',
14
- 'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
15
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pdf-version/main.py DELETED
@@ -1,83 +0,0 @@
1
- #!/usr/bin/env python3
2
- """OnCall AI - Medical RAG System
3
-
4
- Main entry point for the medical RAG system.
5
- """
6
-
7
- import sys
8
- from pathlib import Path
9
-
10
- # Add pdf-version directory to Python path
11
- sys.path.insert(0, str(Path(__file__).parent))
12
-
13
- from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
14
-
15
-
16
- def main():
17
- """Main program entry point."""
18
- try:
19
- # Build the system with chunk embeddings
20
- build_medical_rag_system(enable_chunk_embeddings=True)
21
-
22
- # Demo chunk-based retrieval
23
- print("\n" + "="*80)
24
- print("🧩 CHUNK-BASED RETRIEVAL DEMO")
25
- print("="*80)
26
- demo_rag_query("chest pain and shortness of breath",
27
- strategy="top_p", use_chunks=True, top_p=0.8)
28
-
29
- except KeyboardInterrupt:
30
- print("\n\nπŸ‘‹ User interrupted, program exiting")
31
- except Exception as e:
32
- print(f"\n❌ Program execution error: {e}")
33
- import traceback
34
- traceback.print_exc()
35
-
36
-
37
- def interactive_demo():
38
- """Interactive demo mode."""
39
- print("πŸ₯ OnCall AI - Interactive Demo Mode")
40
- print("=" * 50)
41
-
42
- while True:
43
- print("\nOptions:")
44
- print("1. Build/rebuild system")
45
- print("2. Query with TOP-P strategy")
46
- print("3. Query with TOP-K strategy")
47
- print("4. Compare all strategies")
48
- print("5. Custom query")
49
- print("6. Exit")
50
-
51
- choice = input("\nSelect option (1-6): ").strip()
52
-
53
- if choice == "1":
54
- build_medical_rag_system(enable_chunk_embeddings=True)
55
- elif choice == "2":
56
- query = input("Enter your query: ").strip()
57
- if query:
58
- demo_rag_query(query, strategy="top_p", use_chunks=True)
59
- elif choice == "3":
60
- query = input("Enter your query: ").strip()
61
- if query:
62
- demo_rag_query(query, strategy="top_k", use_chunks=True, top_k=3)
63
- elif choice == "4":
64
- query = input("Enter your query: ").strip()
65
- if query:
66
- demo_all_strategies(query)
67
- elif choice == "5":
68
- query = input("Enter your query: ").strip()
69
- strategy = input("Enter strategy (top_k/top_p/threshold): ").strip()
70
- if query and strategy:
71
- demo_rag_query(query, strategy=strategy, use_chunks=True)
72
- elif choice == "6":
73
- print("πŸ‘‹ Goodbye!")
74
- break
75
- else:
76
- print("❌ Invalid option. Please select 1-6.")
77
-
78
-
79
- if __name__ == "__main__":
80
- if len(sys.argv) > 1 and sys.argv[1] == "--interactive":
81
- interactive_demo()
82
- else:
83
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/pdf-version/oncall_ai.py DELETED
@@ -1,55 +0,0 @@
1
- #!/usr/bin/env python3
2
- """OnCall AI - Medical RAG System (Backward Compatibility)
3
-
4
- This file provides backward compatibility with the original rag.py interface.
5
- Import everything from the new modular structure.
6
- """
7
-
8
- import sys
9
- from pathlib import Path
10
-
11
- # Add pdf-version directory to Python path
12
- sys.path.insert(0, str(Path(__file__).parent))
13
-
14
- # Import all functions for backward compatibility
15
- from models.embedding_models import load_biomedbert_model, load_meditron_model
16
- from data.loaders import load_annotations, filter_pdf_files
17
- from data.pdf_processing import (
18
- extract_pdf_text, extract_tables_from_pdf,
19
- extract_images_ocr_from_pdf, extract_pdf_content_enhanced
20
- )
21
- from indexing.document_indexer import build_document_index, split_text_into_chunks
22
- from indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
23
- from indexing.storage import save_document_system, load_document_system
24
- from retrieval.document_retriever import (
25
- find_relevant_documents_top_k, find_relevant_documents_top_p,
26
- find_relevant_documents_threshold, find_relevant_documents,
27
- create_document_tag_mapping
28
- )
29
- from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
30
- from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
31
-
32
- # Main function for backward compatibility
33
- def main():
34
- """Main program entry compatible with original rag.py."""
35
- try:
36
- # Build the system with chunk embeddings
37
- build_medical_rag_system(enable_chunk_embeddings=True)
38
-
39
- # Demo chunk-based retrieval
40
- print("\n" + "="*80)
41
- print("🧩 CHUNK-BASED RETRIEVAL DEMO")
42
- print("="*80)
43
- demo_rag_query("chest pain and shortness of breath",
44
- strategy="top_p", use_chunks=True, top_p=0.8)
45
-
46
- except KeyboardInterrupt:
47
- print("\n\nπŸ‘‹ User interrupted, program exiting")
48
- except Exception as e:
49
- print(f"\n❌ Program execution error: {e}")
50
- import traceback
51
- traceback.print_exc()
52
-
53
-
54
- if __name__ == "__main__":
55
- main()