Spaces:
Sleeping
Sleeping
feat: implement hospital-specific customization pipeline with two-stage ANNOY retrieval
Browse files
- Restructure pdf-version to customization/ for hospital-specific deployment
- Add customization_pipeline.py with two-stage retrieval (tag -> document -> chunk)
- Implement ANNOY indices for fast medical concept and chunk similarity search
- Add generate_embeddings.py for building hospital-specific embeddings
- Create test suite validating end-to-end pipeline functionality
- Add customization_requirements.txt with all necessary dependencies
- Update .gitignore to exclude rag_env/ virtual environment
This enables hospitals to deploy their own customized medical RAG systems
with private documents while maintaining the base medical knowledge framework.
- .gitignore +1 -0
- customization/customization_pipeline.py +156 -0
- {src/pdf-version → customization}/generate_embeddings.py +14 -18
- {src/pdf-version → customization/src}/__init__.py +0 -0
- customization/src/data/__init__.py +23 -0
- {src/pdf-version → customization/src}/data/loaders.py +0 -0
- {src/pdf-version → customization/src}/data/pdf_processing.py +0 -0
- {src/pdf-version → customization/src}/demos/__init__.py +0 -0
- {src/pdf-version → customization/src}/demos/demo_runner.py +157 -4
- {src/pdf-version → customization/src}/indexing/__init__.py +0 -0
- customization/src/indexing/annoy_manager.py +392 -0
- {src/pdf-version → customization/src}/indexing/document_indexer.py +0 -0
- {src/pdf-version → customization/src}/indexing/embedding_creator.py +0 -0
- {src/pdf-version → customization/src}/indexing/storage.py +91 -2
- {src/pdf-version → customization/src}/models/__init__.py +0 -0
- {src/pdf-version → customization/src}/models/embedding_models.py +0 -0
- {src/pdf-version → customization/src}/rag/__init__.py +0 -0
- {src/pdf-version → customization/src}/rag/medical_rag_pipeline.py +0 -0
- {src/pdf-version → customization/src}/retrieval/__init__.py +0 -0
- {src/pdf-version → customization/src}/retrieval/chunk_retriever.py +177 -3
- {src/pdf-version → customization/src}/retrieval/document_retriever.py +207 -3
- {src/pdf-version → customization/src}/utils/__init__.py +0 -0
- {src/pdf-version → customization/src}/utils/helpers.py +0 -0
- customization/test/test_pipeline.py +117 -0
- customization_requirements.txt +188 -0
- src/pdf-version/data/__init__.py +0 -15
- src/pdf-version/main.py +0 -83
- src/pdf-version/oncall_ai.py +0 -55
.gitignore
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
# π§ Virtual environments
|
2 |
genAIvenv/
|
3 |
.final_project_env/
|
|
|
4 |
.env
|
5 |
.venv
|
6 |
env/
|
|
|
1 |
# π§ Virtual environments
|
2 |
genAIvenv/
|
3 |
.final_project_env/
|
4 |
+
rag_env/
|
5 |
.env
|
6 |
.venv
|
7 |
env/
|
customization/customization_pipeline.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""Customization Pipeline - Hospital-Specific Document Retrieval
|
3 |
+
|
4 |
+
This module provides the interface for hospital-specific document processing and retrieval.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import sys
|
8 |
+
from pathlib import Path
|
9 |
+
from typing import List, Dict
|
10 |
+
|
11 |
+
# Add src directory to Python path
|
12 |
+
sys.path.insert(0, str(Path(__file__).parent / 'src'))
|
13 |
+
|
14 |
+
# Import necessary modules
|
15 |
+
from models.embedding_models import load_biomedbert_model
|
16 |
+
from data.loaders import load_annotations
|
17 |
+
from indexing.document_indexer import build_document_index
|
18 |
+
from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
|
19 |
+
from indexing.storage import save_document_system, load_document_system_with_annoy
|
20 |
+
from retrieval.document_retriever import create_document_tag_mapping
|
21 |
+
from retrieval.chunk_retriever import find_relevant_chunks_with_fallback
|
22 |
+
|
23 |
+
|
24 |
+
def build_customization_embeddings():
|
25 |
+
"""Build embeddings for the hospital-specific documents in the docs folder."""
|
26 |
+
print("π₯ Building hospital-specific embeddings...")
|
27 |
+
|
28 |
+
# Paths
|
29 |
+
base_path = Path(__file__).parent
|
30 |
+
docs_path = base_path / "docs"
|
31 |
+
processing_path = base_path / "processing"
|
32 |
+
|
33 |
+
# Load model and annotations
|
34 |
+
embedding_model = load_biomedbert_model()
|
35 |
+
annotations = load_annotations(file_path=str(processing_path / "mapping.json"))
|
36 |
+
|
37 |
+
if not annotations:
|
38 |
+
print("β Unable to load annotation data")
|
39 |
+
return False
|
40 |
+
|
41 |
+
# Build document index with chunks
|
42 |
+
print("π Processing documents...")
|
43 |
+
document_index = build_document_index(
|
44 |
+
annotations,
|
45 |
+
assets_dir=str(docs_path),
|
46 |
+
chunk_size=256,
|
47 |
+
chunk_overlap=25
|
48 |
+
)
|
49 |
+
|
50 |
+
# Create embeddings
|
51 |
+
print("π’ Creating embeddings...")
|
52 |
+
tag_embeddings = create_tag_embeddings(embedding_model, document_index)
|
53 |
+
doc_tag_mapping = create_document_tag_mapping(document_index, tag_embeddings)
|
54 |
+
chunk_embeddings = create_chunk_embeddings(embedding_model, document_index)
|
55 |
+
|
56 |
+
# Save everything
|
57 |
+
print("πΎ Saving to processing folder...")
|
58 |
+
save_document_system(
|
59 |
+
document_index,
|
60 |
+
tag_embeddings,
|
61 |
+
doc_tag_mapping,
|
62 |
+
chunk_embeddings,
|
63 |
+
output_dir=str(processing_path / "embeddings"),
|
64 |
+
build_annoy_indices=True
|
65 |
+
)
|
66 |
+
|
67 |
+
print("β
Embeddings built successfully!")
|
68 |
+
return True
|
69 |
+
|
70 |
+
|
71 |
+
def retrieve_document_chunks(query: str, top_k: int = 5) -> List[Dict]:
|
72 |
+
"""Retrieve relevant document chunks using two-stage ANNOY retrieval.
|
73 |
+
|
74 |
+
Stage 1: Find relevant documents using tag embeddings (medical concepts)
|
75 |
+
Stage 2: Find relevant chunks within those documents using chunk embeddings
|
76 |
+
|
77 |
+
Args:
|
78 |
+
query: The search query
|
79 |
+
top_k: Number of chunks to retrieve
|
80 |
+
|
81 |
+
Returns:
|
82 |
+
List of dictionaries containing chunk information
|
83 |
+
"""
|
84 |
+
# Load model and existing embeddings
|
85 |
+
embedding_model = load_biomedbert_model()
|
86 |
+
|
87 |
+
# Load from processing folder
|
88 |
+
processing_path = Path(__file__).parent / "processing"
|
89 |
+
|
90 |
+
# Load the saved system with ANNOY indices
|
91 |
+
document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = \
|
92 |
+
load_document_system_with_annoy(
|
93 |
+
input_dir=str(processing_path / "embeddings"),
|
94 |
+
annoy_dir=str(processing_path / "indices")
|
95 |
+
)
|
96 |
+
|
97 |
+
if annoy_manager is None:
|
98 |
+
print("β Failed to load ANNOY manager")
|
99 |
+
return []
|
100 |
+
|
101 |
+
# Create query embedding
|
102 |
+
query_embedding = embedding_model.encode(query)
|
103 |
+
|
104 |
+
# Stage 1: Find relevant documents using tag ANNOY index
|
105 |
+
print(f"π Stage 1: Finding relevant documents for query: '{query}'")
|
106 |
+
relevant_tags, tag_distances = annoy_manager.search_tags(
|
107 |
+
query_embedding=query_embedding,
|
108 |
+
n_neighbors=20, # Get more tags to find diverse documents
|
109 |
+
include_distances=True
|
110 |
+
)
|
111 |
+
|
112 |
+
# Get documents that contain these relevant tags
|
113 |
+
relevant_docs = set()
|
114 |
+
for tag in relevant_tags[:10]: # Use top 10 tags
|
115 |
+
for doc_name, doc_info in doc_tag_mapping.items():
|
116 |
+
if tag in doc_info['tags']:
|
117 |
+
relevant_docs.add(doc_name)
|
118 |
+
|
119 |
+
relevant_docs = list(relevant_docs)
|
120 |
+
print(f"β
Found {len(relevant_docs)} relevant documents based on medical tags")
|
121 |
+
|
122 |
+
if not relevant_docs:
|
123 |
+
print("β No relevant documents found")
|
124 |
+
return []
|
125 |
+
|
126 |
+
# Stage 2: Find relevant chunks within these documents using chunk ANNOY index
|
127 |
+
print(f"π Stage 2: Finding relevant chunks within {len(relevant_docs)} documents")
|
128 |
+
chunks, chunk_distances = annoy_manager.search_chunks_in_documents(
|
129 |
+
query_embedding=query_embedding,
|
130 |
+
document_names=relevant_docs,
|
131 |
+
n_neighbors=top_k,
|
132 |
+
include_distances=True
|
133 |
+
)
|
134 |
+
|
135 |
+
# Convert ANNOY distances to cosine similarities
|
136 |
+
from indexing.annoy_manager import convert_angular_distance_to_cosine_similarity
|
137 |
+
|
138 |
+
# Format results
|
139 |
+
results = []
|
140 |
+
for chunk, distance in zip(chunks, chunk_distances):
|
141 |
+
# Convert angular distance to cosine similarity
|
142 |
+
similarity = convert_angular_distance_to_cosine_similarity(distance)
|
143 |
+
|
144 |
+
results.append({
|
145 |
+
'document': chunk['document'],
|
146 |
+
'chunk_text': chunk['text'],
|
147 |
+
'score': similarity,
|
148 |
+
'metadata': {
|
149 |
+
'chunk_id': chunk['chunk_id'],
|
150 |
+
'start_char': chunk.get('start_char', 0),
|
151 |
+
'end_char': chunk.get('end_char', 0)
|
152 |
+
}
|
153 |
+
})
|
154 |
+
|
155 |
+
print(f"β
Retrieved {len(results)} relevant chunks")
|
156 |
+
return results
|
{src/pdf-version → customization}/generate_embeddings.py
RENAMED
@@ -1,18 +1,12 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
-
|
4 |
"""
|
5 |
|
6 |
-
import
|
7 |
-
from pathlib import Path
|
8 |
-
|
9 |
-
# Add pdf-version directory to Python path
|
10 |
-
sys.path.insert(0, str(Path(__file__).parent))
|
11 |
-
|
12 |
-
from demos.demo_runner import build_medical_rag_system
|
13 |
|
14 |
def main():
|
15 |
-
print("π Starting to build
|
16 |
print("π Configuration:")
|
17 |
print(" - Chunk size: 256 tokens")
|
18 |
print(" - Chunk overlap: 25 tokens (10%)")
|
@@ -22,17 +16,19 @@ def main():
|
|
22 |
print("")
|
23 |
|
24 |
try:
|
25 |
-
|
26 |
|
27 |
-
if
|
28 |
-
print("β
Successfully built
|
29 |
-
print("π Generated files:")
|
30 |
-
print(" - document_index.json")
|
31 |
-
print(" - tag_embeddings.json")
|
32 |
-
print(" - document_tag_mapping.json")
|
33 |
-
print(" - chunk_embeddings.json")
|
|
|
|
|
34 |
else:
|
35 |
-
print("β Failed to build
|
36 |
|
37 |
except KeyboardInterrupt:
|
38 |
print("\nβ οΈ Process interrupted by user")
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
+
Generate embeddings for hospital-specific documents
|
4 |
"""
|
5 |
|
6 |
+
from customization_pipeline import build_customization_embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def main():
|
9 |
+
print("π Starting to build hospital-specific embeddings...")
|
10 |
print("π Configuration:")
|
11 |
print(" - Chunk size: 256 tokens")
|
12 |
print(" - Chunk overlap: 25 tokens (10%)")
|
|
|
16 |
print("")
|
17 |
|
18 |
try:
|
19 |
+
success = build_customization_embeddings()
|
20 |
|
21 |
+
if success:
|
22 |
+
print("\nβ
Successfully built embeddings!")
|
23 |
+
print("π Generated files in processing folder:")
|
24 |
+
print(" - embeddings/document_index.json")
|
25 |
+
print(" - embeddings/tag_embeddings.json")
|
26 |
+
print(" - embeddings/document_tag_mapping.json")
|
27 |
+
print(" - embeddings/chunk_embeddings.json")
|
28 |
+
print(" - indices/annoy_metadata.json")
|
29 |
+
print(" - indices/*.ann files")
|
30 |
else:
|
31 |
+
print("\nβ Failed to build embeddings")
|
32 |
|
33 |
except KeyboardInterrupt:
|
34 |
print("\nβ οΈ Process interrupted by user")
|
{src/pdf-version → customization/src}/__init__.py
RENAMED
File without changes
|
customization/src/data/__init__.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Data loading and PDF processing."""
|
2 |
+
|
3 |
+
from .loaders import load_annotations, filter_pdf_files
|
4 |
+
|
5 |
+
# Try to import PDF processing functions, but handle missing dependencies gracefully
|
6 |
+
try:
|
7 |
+
from .pdf_processing import (
|
8 |
+
extract_pdf_text,
|
9 |
+
extract_tables_from_pdf,
|
10 |
+
extract_images_ocr_from_pdf,
|
11 |
+
extract_pdf_content_enhanced
|
12 |
+
)
|
13 |
+
PDF_PROCESSING_AVAILABLE = True
|
14 |
+
__all__ = [
|
15 |
+
'load_annotations', 'filter_pdf_files',
|
16 |
+
'extract_pdf_text', 'extract_tables_from_pdf',
|
17 |
+
'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
|
18 |
+
]
|
19 |
+
except ImportError as e:
|
20 |
+
print(f"β οΈ PDF processing not available: {e}")
|
21 |
+
print("π Only working with existing embeddings")
|
22 |
+
PDF_PROCESSING_AVAILABLE = False
|
23 |
+
__all__ = ['load_annotations', 'filter_pdf_files']
|
{src/pdf-version → customization/src}/data/loaders.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/data/pdf_processing.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/demos/__init__.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/demos/demo_runner.py
RENAMED
@@ -6,9 +6,15 @@ from models.embedding_models import load_biomedbert_model
|
|
6 |
from data.loaders import load_annotations
|
7 |
from indexing.document_indexer import build_document_index
|
8 |
from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
|
9 |
-
from indexing.storage import save_document_system, load_document_system
|
10 |
-
from retrieval.document_retriever import
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
|
@@ -135,4 +141,151 @@ def demo_all_strategies(query: str = "chest pain and shortness of breath"):
|
|
135 |
for strategy, docs in results.items():
|
136 |
print(f"{strategy:>10}: {len(docs)} documents selected")
|
137 |
|
138 |
-
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from data.loaders import load_annotations
|
7 |
from indexing.document_indexer import build_document_index
|
8 |
from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
|
9 |
+
from indexing.storage import save_document_system, load_document_system, load_document_system_with_annoy
|
10 |
+
from retrieval.document_retriever import (
|
11 |
+
create_document_tag_mapping, find_relevant_documents,
|
12 |
+
find_relevant_documents_with_fallback
|
13 |
+
)
|
14 |
+
from retrieval.chunk_retriever import (
|
15 |
+
find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag,
|
16 |
+
find_relevant_chunks_with_fallback
|
17 |
+
)
|
18 |
|
19 |
|
20 |
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
|
|
|
141 |
for strategy, docs in results.items():
|
142 |
print(f"{strategy:>10}: {len(docs)} documents selected")
|
143 |
|
144 |
+
return results
|
145 |
+
|
146 |
+
|
147 |
+
def demo_rag_query_with_annoy(query: str = "chest pain and shortness of breath",
|
148 |
+
strategy: str = "top_p", use_chunks: bool = True, **kwargs):
|
149 |
+
"""Demo RAG query functionality with ANNOY acceleration."""
|
150 |
+
print(f"\nπ Demo ANNOY Query: '{query}' (Strategy: {strategy}, Use chunks: {use_chunks})")
|
151 |
+
print("=" * 80)
|
152 |
+
|
153 |
+
# Try to load existing system with ANNOY
|
154 |
+
document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = load_document_system_with_annoy()
|
155 |
+
|
156 |
+
if document_index is None:
|
157 |
+
print("π¦ No saved system found, building new one...")
|
158 |
+
build_result = build_medical_rag_system(enable_chunk_embeddings=use_chunks)
|
159 |
+
if build_result[0] is None:
|
160 |
+
return
|
161 |
+
embedding_model, document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings = build_result
|
162 |
+
|
163 |
+
# Try to load ANNOY manager after building
|
164 |
+
from indexing.storage import load_annoy_manager
|
165 |
+
annoy_manager = load_annoy_manager()
|
166 |
+
else:
|
167 |
+
embedding_model = load_biomedbert_model()
|
168 |
+
|
169 |
+
print(f"π§ ANNOY Status: {'Available' if annoy_manager else 'Not available (using fallback)'}")
|
170 |
+
|
171 |
+
# Find relevant documents using ANNOY-accelerated method with fallback
|
172 |
+
print(f"\nπ Finding relevant documents...")
|
173 |
+
import time
|
174 |
+
start_time = time.time()
|
175 |
+
|
176 |
+
relevant_docs = find_relevant_documents_with_fallback(
|
177 |
+
query, embedding_model, tag_embeddings, doc_tag_mapping,
|
178 |
+
annoy_manager=annoy_manager, strategy=strategy, **kwargs
|
179 |
+
)
|
180 |
+
|
181 |
+
doc_search_time = time.time() - start_time
|
182 |
+
print(f"β±οΈ Document search completed in {doc_search_time:.4f}s")
|
183 |
+
|
184 |
+
if use_chunks and chunk_embeddings:
|
185 |
+
# Find relevant chunks using ANNOY-accelerated method with fallback
|
186 |
+
print(f"\nπ Finding relevant chunks within selected documents...")
|
187 |
+
start_time = time.time()
|
188 |
+
|
189 |
+
relevant_chunks = find_relevant_chunks_with_fallback(
|
190 |
+
query, embedding_model, relevant_docs, chunk_embeddings,
|
191 |
+
annoy_manager=annoy_manager, strategy=strategy,
|
192 |
+
top_chunks_per_doc=3, **kwargs
|
193 |
+
)
|
194 |
+
|
195 |
+
chunk_search_time = time.time() - start_time
|
196 |
+
print(f"β±οΈ Chunk search completed in {chunk_search_time:.4f}s")
|
197 |
+
|
198 |
+
# Get chunks for RAG
|
199 |
+
rag_content = get_chunks_for_rag(relevant_chunks, max_chunks=10)
|
200 |
+
print(f"\nπ Ready for RAG with {len(rag_content)} chunks")
|
201 |
+
|
202 |
+
total_time = doc_search_time + chunk_search_time
|
203 |
+
print(f"π Total search time: {total_time:.4f}s")
|
204 |
+
|
205 |
+
else:
|
206 |
+
# Get full documents for RAG
|
207 |
+
rag_content = get_documents_for_rag(relevant_docs, document_index)
|
208 |
+
print(f"\nπ Ready for RAG with {len(rag_content)} full documents")
|
209 |
+
print(f"π Total search time: {doc_search_time:.4f}s")
|
210 |
+
|
211 |
+
return rag_content
|
212 |
+
|
213 |
+
|
214 |
+
def demo_performance_comparison(query: str = "chest pain and shortness of breath"):
|
215 |
+
"""Demo performance comparison between original and ANNOY methods."""
|
216 |
+
print(f"\nβ‘ Performance Comparison Demo")
|
217 |
+
print("=" * 80)
|
218 |
+
print(f"Query: '{query}'")
|
219 |
+
|
220 |
+
# Load system with ANNOY
|
221 |
+
document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = load_document_system_with_annoy()
|
222 |
+
|
223 |
+
if document_index is None:
|
224 |
+
print("β No saved system found")
|
225 |
+
return
|
226 |
+
|
227 |
+
embedding_model = load_biomedbert_model()
|
228 |
+
strategy = "top_p"
|
229 |
+
strategy_params = {"top_p": 0.8, "min_similarity": 0.3}
|
230 |
+
|
231 |
+
print(f"\nπ Testing document retrieval performance...")
|
232 |
+
|
233 |
+
# Test original method
|
234 |
+
import time
|
235 |
+
start_time = time.time()
|
236 |
+
original_docs = find_relevant_documents(
|
237 |
+
query, embedding_model, tag_embeddings, doc_tag_mapping,
|
238 |
+
strategy=strategy, **strategy_params
|
239 |
+
)
|
240 |
+
original_time = time.time() - start_time
|
241 |
+
|
242 |
+
# Test ANNOY method (with fallback)
|
243 |
+
start_time = time.time()
|
244 |
+
annoy_docs = find_relevant_documents_with_fallback(
|
245 |
+
query, embedding_model, tag_embeddings, doc_tag_mapping,
|
246 |
+
annoy_manager=annoy_manager, strategy=strategy, **strategy_params
|
247 |
+
)
|
248 |
+
annoy_time = time.time() - start_time
|
249 |
+
|
250 |
+
# Results
|
251 |
+
print(f"π Original method: {len(original_docs)} docs in {original_time:.4f}s")
|
252 |
+
print(f"π ANNOY method: {len(annoy_docs)} docs in {annoy_time:.4f}s")
|
253 |
+
|
254 |
+
if annoy_time > 0:
|
255 |
+
speedup = original_time / annoy_time
|
256 |
+
print(f"β‘ Speedup: {speedup:.2f}x")
|
257 |
+
|
258 |
+
# Check result similarity
|
259 |
+
if original_docs and annoy_docs:
|
260 |
+
overlap = set(original_docs) & set(annoy_docs)
|
261 |
+
print(f"π Result overlap: {len(overlap)}/{len(original_docs)} documents")
|
262 |
+
|
263 |
+
# Test chunk retrieval if available
|
264 |
+
if chunk_embeddings and len(original_docs) > 0:
|
265 |
+
print(f"\nπ Testing chunk retrieval performance...")
|
266 |
+
relevant_docs = original_docs[:2] # Test with first 2 documents
|
267 |
+
|
268 |
+
# Original method
|
269 |
+
start_time = time.time()
|
270 |
+
original_chunks = find_relevant_chunks(
|
271 |
+
query, embedding_model, relevant_docs, chunk_embeddings,
|
272 |
+
strategy=strategy, **strategy_params
|
273 |
+
)
|
274 |
+
original_chunk_time = time.time() - start_time
|
275 |
+
|
276 |
+
# ANNOY method (with fallback)
|
277 |
+
start_time = time.time()
|
278 |
+
annoy_chunks = find_relevant_chunks_with_fallback(
|
279 |
+
query, embedding_model, relevant_docs, chunk_embeddings,
|
280 |
+
annoy_manager=annoy_manager, strategy=strategy, **strategy_params
|
281 |
+
)
|
282 |
+
annoy_chunk_time = time.time() - start_time
|
283 |
+
|
284 |
+
print(f"π Original chunks: {len(original_chunks)} chunks in {original_chunk_time:.4f}s")
|
285 |
+
print(f"π ANNOY chunks: {len(annoy_chunks)} chunks in {annoy_chunk_time:.4f}s")
|
286 |
+
|
287 |
+
if annoy_chunk_time > 0:
|
288 |
+
chunk_speedup = original_chunk_time / annoy_chunk_time
|
289 |
+
print(f"β‘ Chunk speedup: {chunk_speedup:.2f}x")
|
290 |
+
|
291 |
+
print(f"\nβ
Performance comparison completed!")
|
{src/pdf-version → customization/src}/indexing/__init__.py
RENAMED
File without changes
|
customization/src/indexing/annoy_manager.py
ADDED
@@ -0,0 +1,392 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""ANNOY index management for PDF-based RAG system."""
|
2 |
+
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import numpy as np
|
6 |
+
from typing import Dict, List, Optional, Tuple, Union
|
7 |
+
from pathlib import Path
|
8 |
+
import logging
|
9 |
+
|
10 |
+
try:
|
11 |
+
from annoy import AnnoyIndex
|
12 |
+
except ImportError:
|
13 |
+
raise ImportError("annoy package is required. Install with: pip install annoy")
|
14 |
+
|
15 |
+
# Configure logging
|
16 |
+
logging.basicConfig(level=logging.INFO)
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
|
20 |
+
class AnnoyIndexManager:
|
21 |
+
"""Manages ANNOY indices for fast vector similarity search."""
|
22 |
+
|
23 |
+
def __init__(self, embedding_dim: int = 1024, metric: str = 'angular'):
|
24 |
+
"""
|
25 |
+
Initialize ANNOY index manager.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
embedding_dim: Dimension of embeddings (1024 for BGE Large Medical)
|
29 |
+
metric: Distance metric ('angular' for cosine similarity, 'euclidean', 'manhattan', 'hamming', 'dot')
|
30 |
+
"""
|
31 |
+
self.embedding_dim = embedding_dim
|
32 |
+
self.metric = metric
|
33 |
+
self.tag_index = None
|
34 |
+
self.chunk_index = None
|
35 |
+
self.tag_to_id_mapping = {}
|
36 |
+
self.id_to_tag_mapping = {}
|
37 |
+
self.chunk_to_id_mapping = {}
|
38 |
+
self.id_to_chunk_mapping = {}
|
39 |
+
|
40 |
+
logger.info(f"Initialized AnnoyIndexManager: dim={embedding_dim}, metric={metric}")
|
41 |
+
|
42 |
+
def build_tag_index(self, tag_embeddings: Dict[str, np.ndarray], n_trees: int = 50) -> AnnoyIndex:
|
43 |
+
"""
|
44 |
+
Build ANNOY index for tag embeddings.
|
45 |
+
|
46 |
+
Args:
|
47 |
+
tag_embeddings: Dictionary mapping tags to their embeddings
|
48 |
+
n_trees: Number of trees (more trees = better precision, slower build)
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
Built ANNOY index
|
52 |
+
"""
|
53 |
+
logger.info(f"Building tag ANNOY index with {len(tag_embeddings)} tags...")
|
54 |
+
|
55 |
+
# Create index
|
56 |
+
self.tag_index = AnnoyIndex(self.embedding_dim, self.metric)
|
57 |
+
|
58 |
+
# Create mappings
|
59 |
+
self.tag_to_id_mapping = {}
|
60 |
+
self.id_to_tag_mapping = {}
|
61 |
+
|
62 |
+
# Add embeddings to index
|
63 |
+
for tag_id, (tag, embedding) in enumerate(tag_embeddings.items()):
|
64 |
+
self.tag_index.add_item(tag_id, embedding)
|
65 |
+
self.tag_to_id_mapping[tag] = tag_id
|
66 |
+
self.id_to_tag_mapping[tag_id] = tag
|
67 |
+
|
68 |
+
# Build index
|
69 |
+
logger.info(f"Building index with {n_trees} trees...")
|
70 |
+
self.tag_index.build(n_trees)
|
71 |
+
|
72 |
+
logger.info(f"β
Tag ANNOY index built successfully: {len(tag_embeddings)} tags")
|
73 |
+
return self.tag_index
|
74 |
+
|
75 |
+
def build_chunk_index(self, chunk_embeddings: Dict[str, List[Dict]], n_trees: int = 50) -> AnnoyIndex:
|
76 |
+
"""
|
77 |
+
Build ANNOY index for chunk embeddings.
|
78 |
+
|
79 |
+
Args:
|
80 |
+
chunk_embeddings: Dictionary mapping document names to lists of chunk dictionaries
|
81 |
+
n_trees: Number of trees
|
82 |
+
|
83 |
+
Returns:
|
84 |
+
Built ANNOY index
|
85 |
+
"""
|
86 |
+
# Count total chunks
|
87 |
+
total_chunks = sum(len(chunks) for chunks in chunk_embeddings.values())
|
88 |
+
logger.info(f"Building chunk ANNOY index with {total_chunks} chunks...")
|
89 |
+
|
90 |
+
# Create index
|
91 |
+
self.chunk_index = AnnoyIndex(self.embedding_dim, self.metric)
|
92 |
+
|
93 |
+
# Create mappings
|
94 |
+
self.chunk_to_id_mapping = {}
|
95 |
+
self.id_to_chunk_mapping = {}
|
96 |
+
|
97 |
+
chunk_id = 0
|
98 |
+
for doc_name, chunks in chunk_embeddings.items():
|
99 |
+
for chunk in chunks:
|
100 |
+
# Create unique chunk identifier
|
101 |
+
chunk_key = f"{doc_name}#{chunk['chunk_id']}"
|
102 |
+
|
103 |
+
# Add to index
|
104 |
+
self.chunk_index.add_item(chunk_id, chunk['embedding'])
|
105 |
+
|
106 |
+
# Create mappings
|
107 |
+
self.chunk_to_id_mapping[chunk_key] = chunk_id
|
108 |
+
self.id_to_chunk_mapping[chunk_id] = {
|
109 |
+
'document': doc_name,
|
110 |
+
'chunk_id': chunk['chunk_id'],
|
111 |
+
'text': chunk['text'],
|
112 |
+
'start_char': chunk.get('start_char', 0),
|
113 |
+
'end_char': chunk.get('end_char', len(chunk['text'])),
|
114 |
+
'token_count': chunk.get('token_count', len(chunk['text'].split())),
|
115 |
+
'chunk_key': chunk_key
|
116 |
+
}
|
117 |
+
|
118 |
+
chunk_id += 1
|
119 |
+
|
120 |
+
# Build index
|
121 |
+
logger.info(f"Building chunk index with {n_trees} trees...")
|
122 |
+
self.chunk_index.build(n_trees)
|
123 |
+
|
124 |
+
logger.info(f"β
Chunk ANNOY index built successfully: {total_chunks} chunks")
|
125 |
+
return self.chunk_index
|
126 |
+
|
127 |
+
def save_indices(self, output_dir: Union[str, Path]):
|
128 |
+
"""
|
129 |
+
Save ANNOY indices and mappings to disk.
|
130 |
+
|
131 |
+
Args:
|
132 |
+
output_dir: Directory to save indices
|
133 |
+
"""
|
134 |
+
output_dir = Path(output_dir)
|
135 |
+
# Save indices at the same level as embeddings, not inside embeddings
|
136 |
+
indices_dir = output_dir.parent / 'indices'
|
137 |
+
indices_dir.mkdir(exist_ok=True)
|
138 |
+
|
139 |
+
# Save tag index
|
140 |
+
if self.tag_index is not None:
|
141 |
+
tag_index_path = indices_dir / 'tag_embeddings.ann'
|
142 |
+
self.tag_index.save(str(tag_index_path))
|
143 |
+
|
144 |
+
# Save tag mappings
|
145 |
+
tag_mappings_path = indices_dir / 'tag_mappings.json'
|
146 |
+
with open(tag_mappings_path, 'w', encoding='utf-8') as f:
|
147 |
+
json.dump({
|
148 |
+
'tag_to_id': self.tag_to_id_mapping,
|
149 |
+
'id_to_tag': self.id_to_tag_mapping
|
150 |
+
}, f, indent=2, ensure_ascii=False)
|
151 |
+
|
152 |
+
logger.info(f"β
Tag index saved: {tag_index_path}")
|
153 |
+
|
154 |
+
# Save chunk index
|
155 |
+
if self.chunk_index is not None:
|
156 |
+
chunk_index_path = indices_dir / 'chunk_embeddings.ann'
|
157 |
+
self.chunk_index.save(str(chunk_index_path))
|
158 |
+
|
159 |
+
# Save chunk mappings
|
160 |
+
chunk_mappings_path = indices_dir / 'chunk_mappings.json'
|
161 |
+
with open(chunk_mappings_path, 'w', encoding='utf-8') as f:
|
162 |
+
json.dump({
|
163 |
+
'chunk_to_id': self.chunk_to_id_mapping,
|
164 |
+
'id_to_chunk': self.id_to_chunk_mapping
|
165 |
+
}, f, indent=2, ensure_ascii=False)
|
166 |
+
|
167 |
+
logger.info(f"β
Chunk index saved: {chunk_index_path}")
|
168 |
+
|
169 |
+
# Save index metadata
|
170 |
+
metadata_path = indices_dir / 'annoy_metadata.json'
|
171 |
+
with open(metadata_path, 'w', encoding='utf-8') as f:
|
172 |
+
json.dump({
|
173 |
+
'embedding_dim': self.embedding_dim,
|
174 |
+
'metric': self.metric,
|
175 |
+
'tag_index_exists': self.tag_index is not None,
|
176 |
+
'chunk_index_exists': self.chunk_index is not None,
|
177 |
+
'num_tags': len(self.tag_to_id_mapping),
|
178 |
+
'num_chunks': len(self.chunk_to_id_mapping)
|
179 |
+
}, f, indent=2)
|
180 |
+
|
181 |
+
logger.info(f"β
ANNOY indices saved to: {indices_dir}")
|
182 |
+
|
183 |
+
def load_indices(self, input_dir: Union[str, Path]) -> bool:
    """
    Load ANNOY indices and mappings from disk.

    Files are read from ``input_dir.parent / 'indices'`` (a sibling of the
    embeddings directory, not a subdirectory of it). When
    ``annoy_metadata.json`` is present, ``embedding_dim`` and ``metric`` are
    replaced by the stored values before any index object is created.

    Args:
        input_dir: Embeddings directory whose sibling ``indices`` directory
            holds the saved index, mapping and metadata files

    Returns:
        True if at least one index (tag or chunk) was loaded; False if the
        indices directory is missing, holds no complete index/mapping pair,
        or loading raised an exception
    """
    input_dir = Path(input_dir)
    # Load indices from the same level as embeddings, not inside embeddings
    indices_dir = input_dir.parent / 'indices'

    if not indices_dir.exists():
        logger.warning(f"Indices directory not found: {indices_dir}")
        return False

    try:
        # Read metadata first so the AnnoyIndex objects below are created
        # with the stored dimension and metric.
        metadata_path = indices_dir / 'annoy_metadata.json'
        if metadata_path.exists():
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            self.embedding_dim = metadata['embedding_dim']
            self.metric = metadata['metric']
            logger.info(f"Loaded metadata: dim={self.embedding_dim}, metric={self.metric}")

        loaded_any = False

        # Load tag index (needs both the .ann file and its mappings)
        tag_index_path = indices_dir / 'tag_embeddings.ann'
        tag_mappings_path = indices_dir / 'tag_mappings.json'

        if tag_index_path.exists() and tag_mappings_path.exists():
            self.tag_index = AnnoyIndex(self.embedding_dim, self.metric)
            self.tag_index.load(str(tag_index_path))

            with open(tag_mappings_path, 'r', encoding='utf-8') as f:
                mappings = json.load(f)
            self.tag_to_id_mapping = mappings['tag_to_id']
            # JSON object keys are strings; restore the integer item ids
            self.id_to_tag_mapping = {int(k): v for k, v in mappings['id_to_tag'].items()}
            loaded_any = True

            logger.info(f"✅ Tag index loaded: {len(self.tag_to_id_mapping)} tags")

        # Load chunk index (needs both the .ann file and its mappings)
        chunk_index_path = indices_dir / 'chunk_embeddings.ann'
        chunk_mappings_path = indices_dir / 'chunk_mappings.json'

        if chunk_index_path.exists() and chunk_mappings_path.exists():
            self.chunk_index = AnnoyIndex(self.embedding_dim, self.metric)
            self.chunk_index.load(str(chunk_index_path))

            with open(chunk_mappings_path, 'r', encoding='utf-8') as f:
                mappings = json.load(f)
            self.chunk_to_id_mapping = mappings['chunk_to_id']
            # JSON object keys are strings; restore the integer item ids
            self.id_to_chunk_mapping = {int(k): v for k, v in mappings['id_to_chunk'].items()}
            loaded_any = True

            logger.info(f"✅ Chunk index loaded: {len(self.chunk_to_id_mapping)} chunks")

        # An existing but empty directory is not a successful load: report
        # it so callers can fall back instead of holding an unusable manager.
        if not loaded_any:
            logger.warning(f"No ANNOY index files found in: {indices_dir}")
        return loaded_any

    except Exception as e:
        logger.error(f"Failed to load ANNOY indices: {e}")
        return False
|
246 |
+
|
247 |
+
def search_tags(self, query_embedding: np.ndarray, n_neighbors: int = 10,
                include_distances: bool = True) -> Union[List[str], Tuple[List[str], List[float]]]:
    """
    Look up the tags nearest to a query vector in the tag ANNOY index.

    Args:
        query_embedding: Query embedding vector
        n_neighbors: Number of nearest neighbors to request
        include_distances: Whether to also return the distances

    Returns:
        List of tag names, or tuple of (tag_names, distances)

    Raises:
        ValueError: If the tag index has not been built or loaded
    """
    index = self.tag_index
    if index is None:
        raise ValueError("Tag index not built or loaded")

    id_to_tag = self.id_to_tag_mapping

    if include_distances:
        item_ids, dists = index.get_nns_by_vector(
            query_embedding, n_neighbors, include_distances=True
        )
        # Translate ANNOY item ids back to tag names
        return [id_to_tag[item_id] for item_id in item_ids], dists

    item_ids = index.get_nns_by_vector(
        query_embedding, n_neighbors, include_distances=False
    )
    return [id_to_tag[item_id] for item_id in item_ids]
|
280 |
+
|
281 |
+
def search_chunks(self, query_embedding: np.ndarray, n_neighbors: int = 10,
                  include_distances: bool = True) -> Union[List[Dict], Tuple[List[Dict], List[float]]]:
    """
    Look up the chunks nearest to a query vector in the chunk ANNOY index.

    Args:
        query_embedding: Query embedding vector
        n_neighbors: Number of nearest neighbors to request
        include_distances: Whether to also return the distances

    Returns:
        List of chunk dictionaries, or tuple of (chunks, distances)

    Raises:
        ValueError: If the chunk index has not been built or loaded
    """
    index = self.chunk_index
    if index is None:
        raise ValueError("Chunk index not built or loaded")

    id_to_chunk = self.id_to_chunk_mapping

    if include_distances:
        item_ids, dists = index.get_nns_by_vector(
            query_embedding, n_neighbors, include_distances=True
        )
        # Translate ANNOY item ids back to the stored chunk info
        return [id_to_chunk[item_id] for item_id in item_ids], dists

    item_ids = index.get_nns_by_vector(
        query_embedding, n_neighbors, include_distances=False
    )
    return [id_to_chunk[item_id] for item_id in item_ids]
|
314 |
+
|
315 |
+
def search_chunks_in_documents(self, query_embedding: np.ndarray,
                               document_names: List[str], n_neighbors: int = 10,
                               include_distances: bool = True) -> Union[List[Dict], Tuple[List[Dict], List[float]]]:
    """
    Search for similar chunks within specific documents.

    The chunk index is queried for up to ``n_neighbors * 5`` candidates
    (capped at the number of known chunks); candidates are then kept, in
    the order the index returned them, only if their ``'document'`` value is
    in ``document_names``. Fewer than ``n_neighbors`` results are returned
    when the candidate pool does not contain enough matching chunks.

    Args:
        query_embedding: Query embedding vector
        document_names: List of document names to search within
        n_neighbors: Maximum number of chunks to return
        include_distances: Whether to return distances

    Returns:
        List of chunk dictionaries, or tuple of (chunks, distances)

    Raises:
        ValueError: If the chunk index has not been built or loaded
    """
    if self.chunk_index is None:
        raise ValueError("Chunk index not built or loaded")

    # Build the filter once: set membership instead of a list scan per candidate
    wanted_documents = set(document_names)
    if not wanted_documents:
        # Nothing can match, so skip the index query entirely
        return ([], []) if include_distances else []

    # Get more candidates than needed since we'll filter by document
    search_candidates = min(n_neighbors * 5, len(self.id_to_chunk_mapping))

    if include_distances:
        candidate_ids, distances = self.chunk_index.get_nns_by_vector(
            query_embedding, search_candidates, include_distances=True
        )
    else:
        candidate_ids = self.chunk_index.get_nns_by_vector(
            query_embedding, search_candidates, include_distances=False
        )
        # Placeholder so both modes share one loop
        distances = [None] * len(candidate_ids)

    filtered_chunks = []
    filtered_distances = []

    for candidate_id, distance in zip(candidate_ids, distances):
        chunk_info = self.id_to_chunk_mapping[candidate_id]
        if chunk_info['document'] not in wanted_documents:
            continue

        filtered_chunks.append(chunk_info)
        filtered_distances.append(distance)

        if len(filtered_chunks) >= n_neighbors:
            break

    if include_distances:
        return filtered_chunks, filtered_distances
    return filtered_chunks
|
364 |
+
|
365 |
+
def get_index_stats(self) -> Dict:
    """Summarise the configuration and size of the currently held indices."""
    # Counts are reported as 0 for an index that is not present, even if a
    # mapping dict happens to be populated.
    tag_count = len(self.tag_to_id_mapping) if self.tag_index else 0
    chunk_count = len(self.chunk_to_id_mapping) if self.chunk_index else 0

    return {
        'embedding_dim': self.embedding_dim,
        'metric': self.metric,
        'tag_index_loaded': self.tag_index is not None,
        'chunk_index_loaded': self.chunk_index is not None,
        'num_tags': tag_count,
        'num_chunks': chunk_count,
    }
|
376 |
+
|
377 |
+
|
378 |
+
def convert_angular_distance_to_cosine_similarity(angular_distance: float) -> float:
    """
    Convert ANNOY angular distance to cosine similarity.

    Annoy's ``angular`` metric is the Euclidean distance between the
    normalized vectors, ``d = sqrt(2 * (1 - cos))``, so the inverse is
    ``cos = 1 - d**2 / 2``. (It is not an arc-length / ``arccos`` based
    angle, so a ``cos(d * pi / 2)`` conversion gives wrong similarities for
    every distance other than 0 and 2.)

    Args:
        angular_distance: Angular distance from ANNOY (0 to 2)

    Returns:
        Cosine similarity (-1 to 1)
    """
    similarity = 1.0 - (angular_distance * angular_distance) / 2.0
    # Clamp: float rounding in the distance can push the result just outside [-1, 1]
    return max(-1.0, min(1.0, similarity))
|
{src/pdf-version → customization/src}/indexing/document_indexer.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/indexing/embedding_creator.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/indexing/storage.py
RENAMED
@@ -2,13 +2,19 @@
|
|
2 |
|
3 |
import json
|
4 |
import os
|
|
|
5 |
from typing import Dict, Optional, Tuple
|
6 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
10 |
doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
|
11 |
-
output_dir: str = None):
|
12 |
"""Save the complete document indexing system.
|
13 |
|
14 |
Args:
|
@@ -85,6 +91,31 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
|
85 |
with open(os.path.join(output_dir, 'chunk_embeddings.json'), 'w', encoding='utf-8') as f:
|
86 |
json.dump(chunk_embeddings_serializable, f, indent=2, ensure_ascii=False)
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
print("β
Document system saved to files")
|
89 |
|
90 |
|
@@ -161,4 +192,62 @@ def load_document_system(input_dir: str = None) -> Tuple[Optional[Dict], Optiona
|
|
161 |
|
162 |
except Exception as e:
|
163 |
print(f"β Failed to load document system: {e}")
|
164 |
-
return None, None, None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import json
|
4 |
import os
|
5 |
+
import logging
|
6 |
from typing import Dict, Optional, Tuple
|
7 |
import numpy as np
|
8 |
+
from .annoy_manager import AnnoyIndexManager
|
9 |
+
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
|
14 |
|
15 |
def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
16 |
doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
|
17 |
+
output_dir: str = None, build_annoy_indices: bool = True):
|
18 |
"""Save the complete document indexing system.
|
19 |
|
20 |
Args:
|
|
|
91 |
with open(os.path.join(output_dir, 'chunk_embeddings.json'), 'w', encoding='utf-8') as f:
|
92 |
json.dump(chunk_embeddings_serializable, f, indent=2, ensure_ascii=False)
|
93 |
|
94 |
+
# Build and save ANNOY indices if requested
|
95 |
+
if build_annoy_indices:
|
96 |
+
logger.info("π§ Building ANNOY indices for fast retrieval...")
|
97 |
+
try:
|
98 |
+
# Initialize ANNOY manager (assuming BGE Large Medical embedding dimension)
|
99 |
+
annoy_manager = AnnoyIndexManager(embedding_dim=1024, metric='angular')
|
100 |
+
|
101 |
+
# Build tag index
|
102 |
+
logger.info("Building tag ANNOY index...")
|
103 |
+
annoy_manager.build_tag_index(tag_embeddings, n_trees=50)
|
104 |
+
|
105 |
+
# Build chunk index if chunk embeddings are provided
|
106 |
+
if chunk_embeddings:
|
107 |
+
logger.info("Building chunk ANNOY index...")
|
108 |
+
annoy_manager.build_chunk_index(chunk_embeddings, n_trees=50)
|
109 |
+
|
110 |
+
# Save indices
|
111 |
+
logger.info("Saving ANNOY indices...")
|
112 |
+
annoy_manager.save_indices(output_dir)
|
113 |
+
|
114 |
+
logger.info("β
ANNOY indices built and saved successfully")
|
115 |
+
except Exception as e:
|
116 |
+
logger.error(f"β Failed to build ANNOY indices: {e}")
|
117 |
+
logger.warning("Continuing without ANNOY indices - will use original search methods")
|
118 |
+
|
119 |
print("β
Document system saved to files")
|
120 |
|
121 |
|
|
|
192 |
|
193 |
except Exception as e:
|
194 |
print(f"β Failed to load document system: {e}")
|
195 |
+
return None, None, None, None
|
196 |
+
|
197 |
+
|
198 |
+
def load_annoy_manager(input_dir: str = None) -> Optional[AnnoyIndexManager]:
    """
    Create an AnnoyIndexManager and populate it from previously saved indices.

    Args:
        input_dir: Input directory containing saved indices. When omitted,
            ``embeddings/pdfembeddings`` under the directory four levels
            above this file is used.

    Returns:
        AnnoyIndexManager instance or None if loading fails
    """
    if input_dir is None:
        from pathlib import Path

        # Walk up four directory levels from this file to the project root
        project_root = Path(__file__).parent.parent.parent.parent
        input_dir = project_root / 'embeddings' / 'pdfembeddings'

    try:
        # Constructor arguments are starting values; load_indices() replaces
        # them when saved metadata is available.
        manager = AnnoyIndexManager(embedding_dim=1024, metric='angular')

        if not manager.load_indices(input_dir):
            logger.warning("⚠️ Failed to load ANNOY indices")
            return None

        logger.info("✅ ANNOY indices loaded successfully")
        return manager

    except Exception as e:
        logger.error(f"❌ Failed to initialize ANNOY manager: {e}")
        return None
|
229 |
+
|
230 |
+
|
231 |
+
def load_document_system_with_annoy(input_dir: str = None, annoy_dir: str = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict], Optional[AnnoyIndexManager]]:
    """
    Load the saved document system together with its ANNOY index manager.

    Args:
        input_dir: Input directory containing saved files
        annoy_dir: Directory containing ANNOY indices (if different from input_dir)

    Returns:
        Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager).
        All five values are None if the document system fails to load; only
        annoy_manager is None if just the ANNOY indices fail to load.
    """
    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings = load_document_system(input_dir)

    # Without the base document system there is nothing to attach indices to
    if document_index is None:
        return None, None, None, None, None

    # Prefer an explicitly given ANNOY directory; otherwise reuse input_dir
    annoy_source = annoy_dir if annoy_dir else input_dir
    annoy_manager = load_annoy_manager(annoy_source)

    return document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager
|
{src/pdf-version → customization/src}/models/__init__.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/models/embedding_models.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/rag/__init__.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/rag/medical_rag_pipeline.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/retrieval/__init__.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/retrieval/chunk_retriever.py
RENAMED
@@ -1,9 +1,15 @@
|
|
1 |
"""Chunk-level retrieval functionality."""
|
2 |
|
3 |
-
from typing import List, Dict, Callable
|
4 |
import numpy as np
|
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
-
from
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
@@ -190,4 +196,172 @@ def get_chunks_for_rag(relevant_chunks: List[Dict], max_chunks: int = 10) -> Lis
|
|
190 |
rag_chunks.append(formatted_chunk)
|
191 |
|
192 |
print(f"π Retrieved {len(rag_chunks)} chunks for RAG")
|
193 |
-
return rag_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""Chunk-level retrieval functionality."""
|
2 |
|
3 |
+
from typing import List, Dict, Callable, Optional
|
4 |
import numpy as np
|
5 |
+
import logging
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
+
from indexing.embedding_creator import create_text_embedding
|
8 |
+
from indexing.annoy_manager import AnnoyIndexManager, convert_angular_distance_to_cosine_similarity
|
9 |
+
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
|
14 |
|
15 |
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
|
|
196 |
rag_chunks.append(formatted_chunk)
|
197 |
|
198 |
print(f"π Retrieved {len(rag_chunks)} chunks for RAG")
|
199 |
+
return rag_chunks
|
200 |
+
|
201 |
+
|
202 |
+
# ANNOY-accelerated chunk retrieval functions
|
203 |
+
|
204 |
+
def find_relevant_chunks_annoy_top_k(query: str, model: SentenceTransformer,
                                     relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
                                     top_chunks_per_doc: int = 3,
                                     similarity_metric: str = "angular") -> List[Dict]:
    """Find most relevant chunks using ANNOY index and Top-K strategy.

    Args:
        query: Query text to embed and search with
        model: Embedding model passed to ``create_text_embedding``
        relevant_docs: Document names to restrict the chunk search to
        annoy_manager: Manager whose chunk index is searched
        top_chunks_per_doc: Maximum number of chunks kept per document
        similarity_metric: Accepted for interface compatibility; not used here

    Returns:
        Chunk dictionaries (with a ``'similarity'`` field) sorted by
        descending similarity.
    """
    query_embedding = create_text_embedding(model, query)

    # Ask ANNOY for enough hits to fill every document's quota
    hits, hit_distances = annoy_manager.search_chunks_in_documents(
        query_embedding, relevant_docs,
        n_neighbors=len(relevant_docs) * top_chunks_per_doc,
        include_distances=True
    )

    # Build one result record per hit, bucketed by its document
    chunks_by_document = {}
    for chunk, distance in zip(hits, hit_distances):
        similarity = convert_angular_distance_to_cosine_similarity(distance)
        record = {
            'document': chunk['document'],
            'chunk_id': chunk['chunk_id'],
            'text': chunk['text'],
            'start_char': chunk.get('start_char', 0),
            'end_char': chunk.get('end_char', len(chunk['text'])),
            'token_count': chunk.get('token_count', len(chunk['text'].split())),
            'similarity': similarity
        }
        chunks_by_document.setdefault(record['document'], []).append(record)

    # Keep the best chunks of each requested document, in request order
    final_chunks = []
    for doc_name in relevant_docs:
        ranked = chunks_by_document.get(doc_name)
        if ranked is None:
            continue
        ranked.sort(key=lambda item: item['similarity'], reverse=True)
        final_chunks.extend(ranked[:top_chunks_per_doc])

    # Overall ranking across documents
    final_chunks.sort(key=lambda item: item['similarity'], reverse=True)

    logger.info(f"π Found {len(final_chunks)} relevant chunks (ANNOY Top-K)")
    for rank, chunk in enumerate(final_chunks[:5], start=1):  # Show top 5
        logger.info(f" {rank}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
        logger.info(f" Preview: {chunk['text'][:100]}...")

    return final_chunks
|
258 |
+
|
259 |
+
|
260 |
+
def find_relevant_chunks_annoy_top_p(query: str, model: SentenceTransformer,
                                     relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
                                     top_p: float = 0.6, min_similarity: float = 0.3,
                                     similarity_metric: str = "angular") -> List[Dict]:
    """Find most relevant chunks using ANNOY index and Top-P strategy.

    Args:
        query: Query text to embed and search with
        model: Embedding model passed to ``create_text_embedding``
        relevant_docs: Document names to restrict the chunk search to
        annoy_manager: Manager whose chunk index is searched
        top_p: Cumulative share of total similarity at which selection stops
        min_similarity: Chunks with lower similarity are discarded first
        similarity_metric: Accepted for interface compatibility; not used here

    Returns:
        Selected chunk dictionaries in descending similarity order, or an
        empty list when no chunk reaches ``min_similarity``.
    """
    query_embedding = create_text_embedding(model, query)

    # Search more chunks to ensure we have enough candidates for Top-P selection
    search_candidates = min(len(relevant_docs) * 10, 100)  # Reasonable upper limit

    hits, hit_distances = annoy_manager.search_chunks_in_documents(
        query_embedding, relevant_docs,
        n_neighbors=search_candidates,
        include_distances=True
    )

    # Pair every hit with its similarity, then keep those above the threshold
    scored_hits = (
        (chunk, convert_angular_distance_to_cosine_similarity(distance))
        for chunk, distance in zip(hits, hit_distances)
    )
    filtered_chunks = [
        {
            'document': chunk['document'],
            'chunk_id': chunk['chunk_id'],
            'text': chunk['text'],
            'start_char': chunk.get('start_char', 0),
            'end_char': chunk.get('end_char', len(chunk['text'])),
            'token_count': chunk.get('token_count', len(chunk['text'].split())),
            'similarity': similarity
        }
        for chunk, similarity in scored_hits
        if similarity >= min_similarity
    ]

    if not filtered_chunks:
        logger.warning(f"⚠️ No chunks found above similarity threshold {min_similarity}")
        return []

    filtered_chunks.sort(key=lambda item: item['similarity'], reverse=True)

    # Top-P: treat each similarity as a share of the total and accumulate
    # shares, best first, until the requested mass is covered.
    total_score = sum(item['similarity'] for item in filtered_chunks)
    cumulative_prob = 0.0
    selected_chunks = []

    for item in filtered_chunks:
        cumulative_prob += item['similarity'] / total_score
        selected_chunks.append(item)
        if cumulative_prob >= top_p:
            break

    logger.info(f"π Found {len(selected_chunks)} relevant chunks (ANNOY Top-P={top_p})")
    logger.info(f"π Filtered from {len(filtered_chunks)} chunks above threshold")
    logger.info(f"π Cumulative probability: {cumulative_prob:.3f}")

    for rank, chunk in enumerate(selected_chunks[:5], start=1):  # Show top 5
        logger.info(f" {rank}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
        logger.info(f" Preview: {chunk['text'][:100]}...")

    return selected_chunks
|
325 |
+
|
326 |
+
|
327 |
+
def find_relevant_chunks_annoy(query: str, model: SentenceTransformer,
                               relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
                               strategy: str = "top_p", **kwargs) -> List[Dict]:
    """Unified interface for ANNOY-accelerated chunk retrieval with different strategies.

    Args:
        query: Query text
        model: Embedding model forwarded to the strategy function
        relevant_docs: Document names to restrict the chunk search to
        annoy_manager: Manager whose chunk index is searched
        strategy: ``"top_k"`` or ``"top_p"``
        **kwargs: Optional ``similarity_metric``, plus ``top_chunks_per_doc``
            (top_k) or ``top_p`` / ``min_similarity`` (top_p); other keys
            are ignored

    Returns:
        The chunk list produced by the selected strategy function.

    Raises:
        ValueError: If ``strategy`` is neither ``"top_k"`` nor ``"top_p"``
    """
    similarity_metric = kwargs.get("similarity_metric", "angular")

    if strategy == "top_k":
        return find_relevant_chunks_annoy_top_k(
            query, model, relevant_docs, annoy_manager,
            top_chunks_per_doc=kwargs.get("top_chunks_per_doc", 3),
            similarity_metric=similarity_metric,
        )

    if strategy == "top_p":
        return find_relevant_chunks_annoy_top_p(
            query, model, relevant_docs, annoy_manager,
            top_p=kwargs.get("top_p", 0.6),
            min_similarity=kwargs.get("min_similarity", 0.3),
            similarity_metric=similarity_metric,
        )

    raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k' or 'top_p'")
|
347 |
+
|
348 |
+
|
349 |
+
def find_relevant_chunks_with_fallback(query: str, model: SentenceTransformer,
                                       relevant_docs: List[str], chunk_embeddings: Dict,
                                       annoy_manager: Optional[AnnoyIndexManager] = None,
                                       strategy: str = "top_p", **kwargs) -> List[Dict]:
    """
    Retrieve relevant chunks, preferring ANNOY and falling back to the original search.

    When ``annoy_manager`` is given, the ANNOY path is tried first; any
    exception it raises is logged as a warning and the original
    ``find_relevant_chunks`` search over ``chunk_embeddings`` is used instead.
    Without a manager the original search is used directly.
    """
    annoy_available = annoy_manager is not None

    if annoy_available:
        try:
            logger.info("π Using ANNOY-accelerated chunk retrieval")
            return find_relevant_chunks_annoy(
                query, model, relevant_docs, annoy_manager, strategy=strategy, **kwargs
            )
        except Exception as e:
            # Best-effort acceleration: report the failure and continue below
            logger.warning(f"⚠️ ANNOY chunk retrieval failed, falling back to original method: {e}")

    # Fallback to original method
    logger.info("π Using original chunk retrieval method")
    return find_relevant_chunks(query, model, relevant_docs, chunk_embeddings, strategy, **kwargs)
|
{src/pdf-version → customization/src}/retrieval/document_retriever.py
RENAMED
@@ -1,9 +1,15 @@
|
|
1 |
"""Document retrieval strategies and functionality."""
|
2 |
|
3 |
-
from typing import List, Dict
|
4 |
import numpy as np
|
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
-
from
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
def find_relevant_documents_top_k(query: str, model: SentenceTransformer,
|
@@ -189,4 +195,202 @@ def create_document_tag_mapping(document_index: Dict, tag_embeddings: Dict) -> D
|
|
189 |
'treatments': doc_info.get('treatments', [])
|
190 |
}
|
191 |
|
192 |
-
return doc_tag_mapping
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
"""Document retrieval strategies and functionality."""
|
2 |
|
3 |
+
from typing import List, Dict, Optional
|
4 |
import numpy as np
|
5 |
+
import logging
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
+
from indexing.embedding_creator import create_text_embedding
|
8 |
+
from indexing.annoy_manager import AnnoyIndexManager, convert_angular_distance_to_cosine_similarity
|
9 |
+
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO)
|
12 |
+
logger = logging.getLogger(__name__)
|
13 |
|
14 |
|
15 |
def find_relevant_documents_top_k(query: str, model: SentenceTransformer,
|
|
|
195 |
'treatments': doc_info.get('treatments', [])
|
196 |
}
|
197 |
|
198 |
+
return doc_tag_mapping
|
199 |
+
|
200 |
+
|
201 |
+
# ANNOY-accelerated document retrieval functions
|
202 |
+
|
203 |
+
def find_relevant_documents_annoy_top_k(query: str, model: SentenceTransformer,
                                        annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                        top_k: int = 3, search_neighbors: int = 20) -> List[str]:
    """Find top-k most relevant documents using ANNOY index for fast tag search.

    Args:
        query: Query text to embed and search with
        model: Embedding model passed to ``create_text_embedding``
        annoy_manager: Manager whose tag index is searched
        doc_tag_mapping: Mapping of document name to an info dict with a ``'tags'`` entry
        top_k: Maximum number of documents to return
        search_neighbors: Number of nearest tags requested from the index

    Returns:
        Document names ordered by descending score.
    """
    query_embedding = create_text_embedding(model, query)

    # Use ANNOY to find similar tags quickly
    similar_tags, distances = annoy_manager.search_tags(
        query_embedding, n_neighbors=search_neighbors, include_distances=True
    )

    # Similarity of every tag the index returned
    tag_similarities = {
        tag: convert_angular_distance_to_cosine_similarity(distance)
        for tag, distance in zip(similar_tags, distances)
    }

    # A document scores as its best-matching tag; tags the index did not
    # return count as 0, and documents without tags are left out.
    doc_scores = {
        pdf_name: max(tag_similarities.get(tag, 0) for tag in doc_info['tags'])
        for pdf_name, doc_info in doc_tag_mapping.items()
        if doc_info['tags']
    }

    ranked_names = sorted(doc_scores, key=doc_scores.get, reverse=True)
    relevant_docs = ranked_names[:top_k]

    logger.info(f"π Found {len(relevant_docs)} relevant documents for query: '{query}' (ANNOY TOP-K)")
    for rank, doc_name in enumerate(relevant_docs, start=1):
        logger.info(f" {rank}. {doc_name} (similarity: {doc_scores[doc_name]:.3f})")

    return relevant_docs
|
242 |
+
|
243 |
+
|
244 |
+
def find_relevant_documents_annoy_top_p(query: str, model: SentenceTransformer,
                                        annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                        top_p: float = 0.6, min_similarity: float = 0.5,
                                        search_neighbors: int = 30) -> List[str]:
    """Find documents using TOP-P (nucleus sampling) approach with ANNOY acceleration.

    Args:
        query: Query text to embed and search with
        model: Embedding model passed to ``create_text_embedding``
        annoy_manager: Manager whose tag index is searched
        doc_tag_mapping: Mapping of document name to an info dict with a ``'tags'`` entry
        top_p: Cumulative share of total score at which selection stops
        min_similarity: Documents scoring lower are discarded first
        search_neighbors: Number of nearest tags requested from the index

    Returns:
        Selected document names in descending score order, or an empty list
        when no document reaches ``min_similarity``.
    """
    query_embedding = create_text_embedding(model, query)

    # Use ANNOY to find similar tags quickly
    similar_tags, distances = annoy_manager.search_tags(
        query_embedding, n_neighbors=search_neighbors, include_distances=True
    )

    # Similarity of every tag the index returned
    tag_similarities = {
        tag: convert_angular_distance_to_cosine_similarity(distance)
        for tag, distance in zip(similar_tags, distances)
    }

    # A document scores as its best-matching tag; tags the index did not
    # return count as 0, and documents without tags are left out.
    doc_scores = {
        pdf_name: max(tag_similarities.get(tag, 0) for tag in doc_info['tags'])
        for pdf_name, doc_info in doc_tag_mapping.items()
        if doc_info['tags']
    }

    # Keep only documents at or above the minimum similarity, best first
    sorted_docs = sorted(
        ((name, score) for name, score in doc_scores.items() if score >= min_similarity),
        key=lambda pair: pair[1],
        reverse=True,
    )

    if not sorted_docs:
        logger.warning(f"⚠️ No documents found above similarity threshold {min_similarity}")
        return []

    # TOP-P: accumulate each document's share of the total score, best
    # first, until the requested mass is covered.
    total_score = sum(score for _, score in sorted_docs)
    cumulative_prob = 0.0
    selected_docs = []

    for doc_name, score in sorted_docs:
        cumulative_prob += score / total_score
        selected_docs.append(doc_name)
        if cumulative_prob >= top_p:
            break

    logger.info(f"π Found {len(selected_docs)} relevant documents for query: '{query}' (ANNOY TOP-P={top_p})")
    logger.info(f"π Cumulative probability: {cumulative_prob:.3f}")

    for rank, doc_name in enumerate(selected_docs, start=1):
        score = doc_scores[doc_name]
        prob = score / total_score
        logger.info(f" {rank}. {doc_name} (similarity: {score:.3f}, prob: {prob:.3f})")

    return selected_docs
|
308 |
+
|
309 |
+
|
310 |
+
def find_relevant_documents_annoy_threshold(query: str, model: SentenceTransformer,
                                            annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                            similarity_threshold: float = 0.5, search_neighbors: int = 50) -> List[str]:
    """Return every document whose blended tag score reaches the threshold.

    The query is embedded, the ANNOY tag index is asked for its
    ``search_neighbors`` nearest tags, and each document in
    ``doc_tag_mapping`` is scored from the similarities of its own tags.

    Args:
        query: Free-text query.
        model: Embedding model handed to ``create_text_embedding``.
        annoy_manager: Index manager providing ``search_tags``.
        doc_tag_mapping: Maps a document name to a dict with a ``'tags'`` entry.
        similarity_threshold: Minimum blended score a document must reach.
        search_neighbors: Number of neighbouring tags requested from ANNOY.

    Returns:
        Document names ordered from highest to lowest blended score.
    """
    query_vector = create_text_embedding(model, query)

    # Nearest tags and their distances, as reported by the ANNOY tag index.
    neighbor_tags, neighbor_distances = annoy_manager.search_tags(
        query_vector, n_neighbors=search_neighbors, include_distances=True
    )

    # Distance -> similarity for each returned tag.
    tag_similarities = {
        tag: convert_angular_distance_to_cosine_similarity(distance)
        for tag, distance in zip(neighbor_tags, neighbor_distances)
    }

    doc_scores = {}
    for pdf_name, doc_info in doc_tag_mapping.items():
        doc_tags = doc_info['tags']
        if not doc_tags:
            continue

        # Tags that ANNOY did not return contribute a similarity of 0.
        per_tag = [tag_similarities.get(tag, 0) for tag in doc_tags]

        # 70% mean (overall relevance) + 30% max (strongest single match).
        blended = np.mean(per_tag) * 0.7 + max(per_tag) * 0.3
        if blended >= similarity_threshold:
            doc_scores[pdf_name] = blended

    # Highest score first; ties keep their insertion order.
    relevant_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)

    logger.info(f"π Found {len(relevant_docs)} relevant documents for query: '{query}' (ANNOY threshold={similarity_threshold})")
    for rank, doc_name in enumerate(relevant_docs, start=1):
        logger.info(f" {rank}. {doc_name} (similarity: {doc_scores[doc_name]:.3f})")

    return relevant_docs
|
352 |
+
|
353 |
+
|
354 |
+
def find_relevant_documents_annoy(query: str, model: SentenceTransformer,
                                  annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
                                  strategy: str = "top_k", **kwargs) -> List[str]:
    """Dispatch an ANNOY-backed document search to the chosen strategy.

    Args:
        query: Free-text query.
        model: Embedding model forwarded to the strategy function.
        annoy_manager: ANNOY index manager forwarded to the strategy function.
        doc_tag_mapping: Document-to-tags mapping forwarded to the strategy function.
        strategy: One of ``"top_k"``, ``"top_p"`` or ``"threshold"``.
        **kwargs: Optional per-strategy settings; anything missing falls back
            to the defaults written below.

    Returns:
        Whatever the selected strategy function returns.

    Raises:
        ValueError: If ``strategy`` is not one of the three supported names.
    """
    if strategy == "top_k":
        return find_relevant_documents_annoy_top_k(
            query, model, annoy_manager, doc_tag_mapping,
            kwargs.get("top_k", 3),
            kwargs.get("search_neighbors", 20),
        )

    if strategy == "top_p":
        return find_relevant_documents_annoy_top_p(
            query, model, annoy_manager, doc_tag_mapping,
            kwargs.get("top_p", 0.6),
            kwargs.get("min_similarity", 0.5),
            kwargs.get("search_neighbors", 30),
        )

    if strategy == "threshold":
        return find_relevant_documents_annoy_threshold(
            query, model, annoy_manager, doc_tag_mapping,
            kwargs.get("similarity_threshold", 0.5),
            kwargs.get("search_neighbors", 50),
        )

    raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k', 'top_p', or 'threshold'")
|
376 |
+
|
377 |
+
|
378 |
+
def find_relevant_documents_with_fallback(query: str, model: SentenceTransformer,
                                          tag_embeddings: Dict, doc_tag_mapping: Dict,
                                          annoy_manager: Optional[AnnoyIndexManager] = None,
                                          strategy: str = "top_k", **kwargs) -> List[str]:
    """Search for documents via ANNOY when possible, else via the original search.

    When ``annoy_manager`` is supplied, ``find_relevant_documents_annoy`` is
    tried first. If no manager is given, or the ANNOY path raises, the call is
    served by ``find_relevant_documents`` using ``tag_embeddings``.

    Args:
        query: Free-text query.
        model: Embedding model forwarded to the search function.
        tag_embeddings: Tag embeddings used only by the non-ANNOY search.
        doc_tag_mapping: Document-to-tags mapping.
        annoy_manager: Optional ANNOY index manager.
        strategy: Strategy name forwarded unchanged.
        **kwargs: Strategy options forwarded unchanged.

    Returns:
        The list returned by whichever search function handled the query.
    """
    use_annoy = annoy_manager is not None
    if use_annoy:
        try:
            logger.info("π Using ANNOY-accelerated document retrieval")
            return find_relevant_documents_annoy(query, model, annoy_manager, doc_tag_mapping, strategy, **kwargs)
        except Exception as e:
            # Best effort: any ANNOY failure is logged and the original search takes over.
            logger.warning(f"β οΈ ANNOY retrieval failed, falling back to original method: {e}")

    logger.info("π Using original document retrieval method")
    return find_relevant_documents(query, model, tag_embeddings, doc_tag_mapping, strategy, **kwargs)
|
{src/pdf-version → customization/src}/utils/__init__.py
RENAMED
File without changes
|
{src/pdf-version → customization/src}/utils/helpers.py
RENAMED
File without changes
|
customization/test/test_pipeline.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""Test script to verify the customization pipeline with ANNOY indices."""
|
3 |
+
|
4 |
+
import sys
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
# Add parent directory to path
|
8 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
9 |
+
|
10 |
+
from customization_pipeline import retrieve_document_chunks
|
11 |
+
|
12 |
+
|
13 |
+
def test_pipeline():
    """Run a fixed set of sample queries through the pipeline and print the hits.

    For every query, ``retrieve_document_chunks`` is called with ``top_k=3``
    and each returned result's document, score, chunk id and a 150-character
    text preview are printed. An exception raised while handling one query is
    printed with its traceback and the loop moves on to the next query.
    """
    print("π§ͺ Testing Customization Pipeline with ANNOY Indices")
    print("=" * 60)

    # Test queries
    test_queries = [
        "chest pain and shortness of breath",
        "pregnancy bleeding emergency",
        "atrial fibrillation treatment",
        "fever of unknown origin",
        "dizziness diagnostic approach"
    ]

    for query in test_queries:
        print(f"\nπ Query: '{query}'")
        print("-" * 60)

        try:
            # Retrieve chunks
            results = retrieve_document_chunks(query, top_k=3)

            if results:
                # Status literal kept on a single line so the f-string is terminated.
                print(f"✅ Found {len(results)} relevant chunks:\n")

                for i, result in enumerate(results, 1):
                    print(f"Result {i}:")
                    print(f" π Document: {result['document']}")
                    print(f" π Score: {result['score']:.4f}")
                    print(f" π Chunk ID: {result['metadata']['chunk_id']}")
                    print(f" π Text Preview: {result['chunk_text'][:150]}...")
                    print()
            else:
                print("β No results found")

        except Exception as e:
            # Keep going with the remaining queries; show what went wrong.
            print(f"β Error processing query: {e}")
            import traceback
            traceback.print_exc()

    print("\n" + "=" * 60)
    print("✅ Pipeline test completed!")
|
55 |
+
|
56 |
+
|
57 |
+
def test_specific_medical_cases():
    """Query the pipeline once per named clinical scenario and print the hits.

    Each scenario label is paired with a query string; the query is sent to
    ``retrieve_document_chunks`` with ``top_k=2`` and the document name,
    score and chunk id of every result are printed.
    """
    print("\n\nπ₯ Testing Specific Medical Cases")
    print("=" * 60)

    medical_cases = {
        "Cardiac Emergency": "acute coronary syndrome ST elevation",
        "Neurological": "stroke symptoms thrombolysis window",
        "Respiratory": "pulmonary embolism Wells score",
        "Obstetric Emergency": "eclampsia magnesium sulfate",
        "Pediatric": "pediatric seizure management"
    }

    for case_type, query in medical_cases.items():
        print(f"\nπ {case_type}: '{query}'")
        print("-" * 60)

        hits = retrieve_document_chunks(query, top_k=2)

        # Nothing came back for this scenario: say so and move on.
        if not hits:
            print(" No specific guidance found")
            continue

        for hit in hits:
            print(f"π {hit['document']}")
            print(f" Score: {hit['score']:.4f}")
            print(f" Relevant content found in chunk {hit['metadata']['chunk_id']}")
|
83 |
+
|
84 |
+
|
85 |
+
def test_performance():
    """Time ``retrieve_document_chunks`` over a few queries and print the average.

    Each query is issued once with ``top_k=5``. The per-query elapsed time and
    the number of returned chunks are printed, followed by the mean time over
    all queries.
    """
    import time

    print("\n\nβ‘ Testing Retrieval Performance")
    print("=" * 60)

    queries = [
        "chest pain",
        "headache emergency",
        "fever neutropenia",
        "pneumonia antibiotics",
        "atrial fibrillation"
    ]

    total_time = 0
    for query in queries:
        # perf_counter is monotonic and high-resolution, so a system clock
        # adjustment during the call cannot distort the measured interval.
        start_time = time.perf_counter()
        results = retrieve_document_chunks(query, top_k=5)
        elapsed = time.perf_counter() - start_time
        total_time += elapsed

        print(f"Query: '{query}' - Retrieved {len(results)} chunks in {elapsed:.3f}s")

    avg_time = total_time / len(queries)
    print(f"\nπ Average retrieval time: {avg_time:.3f}s per query")
|
111 |
+
|
112 |
+
|
113 |
+
if __name__ == "__main__":
    # Script entry: run each scenario in the same fixed order.
    for scenario in (test_pipeline, test_specific_medical_cases, test_performance):
        scenario()
|
customization_requirements.txt
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Customization Pipeline Requirements
|
2 |
+
# Generated from rag_env environment for hospital-specific document processing
|
3 |
+
#
|
4 |
+
# Key libraries:
|
5 |
+
# - sentence-transformers: Medical domain embeddings (BGE-Large-Medical)
|
6 |
+
# - torch: Deep learning framework
|
7 |
+
# - annoy: Fast vector similarity search indices
|
8 |
+
# - pdfplumber: PDF text and table extraction
|
9 |
+
# - llama-index: Document chunking and processing
|
10 |
+
# - transformers: Hugging Face model support
|
11 |
+
# - openai: LLM integration (optional)
|
12 |
+
#
|
13 |
+
# Install with: pip install -r customization_requirements.txt
|
14 |
+
#
|
15 |
+
accelerate==1.9.0
|
16 |
+
acres==0.5.0
|
17 |
+
aiohappyeyeballs==2.6.1
|
18 |
+
aiohttp==3.12.14
|
19 |
+
aiosignal==1.4.0
|
20 |
+
aiosqlite==0.21.0
|
21 |
+
annotated-types==0.7.0
|
22 |
+
annoy==1.17.3
|
23 |
+
anyio==4.9.0
|
24 |
+
appnope==0.1.4
|
25 |
+
asttokens==3.0.0
|
26 |
+
attrs==25.3.0
|
27 |
+
banks==2.2.0
|
28 |
+
beautifulsoup4==4.13.4
|
29 |
+
bm25s==0.2.13
|
30 |
+
certifi==2025.7.14
|
31 |
+
cffi==1.17.1
|
32 |
+
charset-normalizer==3.4.2
|
33 |
+
ci-info==0.3.0
|
34 |
+
click==8.2.1
|
35 |
+
colorama==0.4.6
|
36 |
+
comm==0.2.2
|
37 |
+
configobj==5.0.9
|
38 |
+
configparser==7.2.0
|
39 |
+
cryptography==45.0.5
|
40 |
+
dataclasses-json==0.6.7
|
41 |
+
debugpy==1.8.15
|
42 |
+
decorator==5.2.1
|
43 |
+
defusedxml==0.7.1
|
44 |
+
Deprecated==1.2.18
|
45 |
+
dirtyjson==1.0.8
|
46 |
+
distro==1.9.0
|
47 |
+
easyocr==1.7.2
|
48 |
+
etelemetry==0.3.1
|
49 |
+
executing==2.2.0
|
50 |
+
filelock==3.18.0
|
51 |
+
filetype==1.2.0
|
52 |
+
fitz==0.0.1.dev2
|
53 |
+
frozenlist==1.7.0
|
54 |
+
fsspec==2025.7.0
|
55 |
+
greenlet==3.2.3
|
56 |
+
griffe==1.7.3
|
57 |
+
h11==0.16.0
|
58 |
+
hf-xet==1.1.5
|
59 |
+
httpcore==1.0.9
|
60 |
+
httplib2==0.22.0
|
61 |
+
httpx==0.28.1
|
62 |
+
huggingface-hub==0.33.4
|
63 |
+
idna==3.10
|
64 |
+
imageio==2.37.0
|
65 |
+
ipykernel==6.30.0
|
66 |
+
ipython==9.4.0
|
67 |
+
ipython_pygments_lexers==1.1.1
|
68 |
+
jedi==0.19.2
|
69 |
+
Jinja2==3.1.6
|
70 |
+
jiter==0.10.0
|
71 |
+
joblib==1.5.1
|
72 |
+
jpype1==1.6.0
|
73 |
+
jupyter_client==8.6.3
|
74 |
+
jupyter_core==5.8.1
|
75 |
+
lazy_loader==0.4
|
76 |
+
llama-cloud==0.1.32
|
77 |
+
llama-cloud-services==0.6.43
|
78 |
+
llama-index==0.12.50
|
79 |
+
llama-index-agent-openai==0.4.12
|
80 |
+
llama-index-cli==0.4.4
|
81 |
+
llama-index-core==0.12.50
|
82 |
+
llama-index-embeddings-huggingface==0.5.5
|
83 |
+
llama-index-embeddings-openai==0.3.1
|
84 |
+
llama-index-indices-managed-llama-cloud==0.7.10
|
85 |
+
llama-index-instrumentation==0.3.0
|
86 |
+
llama-index-llms-huggingface==0.5.0
|
87 |
+
llama-index-llms-openai==0.4.7
|
88 |
+
llama-index-llms-openai-like==0.4.0
|
89 |
+
llama-index-llms-openrouter==0.3.2
|
90 |
+
llama-index-multi-modal-llms-openai==0.5.3
|
91 |
+
llama-index-program-openai==0.3.2
|
92 |
+
llama-index-question-gen-openai==0.3.1
|
93 |
+
llama-index-readers-file==0.4.11
|
94 |
+
llama-index-readers-llama-parse==0.4.0
|
95 |
+
llama-index-retrievers-bm25==0.5.2
|
96 |
+
llama-index-workflows==1.1.0
|
97 |
+
llama-parse==0.6.43
|
98 |
+
looseversion==1.3.0
|
99 |
+
lxml==6.0.0
|
100 |
+
MarkupSafe==3.0.2
|
101 |
+
marshmallow==3.26.1
|
102 |
+
matplotlib-inline==0.1.7
|
103 |
+
mpmath==1.3.0
|
104 |
+
multidict==6.6.3
|
105 |
+
mypy_extensions==1.1.0
|
106 |
+
nest-asyncio==1.6.0
|
107 |
+
networkx==3.5
|
108 |
+
nibabel==5.3.2
|
109 |
+
ninja==1.11.1.4
|
110 |
+
nipype==1.10.0
|
111 |
+
nltk==3.9.1
|
112 |
+
numpy==2.2.6
|
113 |
+
openai==1.97.0
|
114 |
+
opencv-python-headless==4.12.0.88
|
115 |
+
packaging==25.0
|
116 |
+
pandas==2.2.3
|
117 |
+
parso==0.8.4
|
118 |
+
pathlib==1.0.1
|
119 |
+
pdfminer.six==20250506
|
120 |
+
pdfplumber==0.11.7
|
121 |
+
pexpect==4.9.0
|
122 |
+
pillow==11.3.0
|
123 |
+
platformdirs==4.3.8
|
124 |
+
prompt_toolkit==3.0.51
|
125 |
+
propcache==0.3.2
|
126 |
+
prov==2.1.1
|
127 |
+
psutil==7.0.0
|
128 |
+
ptyprocess==0.7.0
|
129 |
+
pure_eval==0.2.3
|
130 |
+
puremagic==1.30
|
131 |
+
pyclipper==1.3.0.post6
|
132 |
+
pycparser==2.22
|
133 |
+
pydantic==2.11.7
|
134 |
+
pydantic_core==2.33.2
|
135 |
+
pydot==4.0.1
|
136 |
+
Pygments==2.19.2
|
137 |
+
PyMuPDF==1.26.3
|
138 |
+
pyparsing==3.2.3
|
139 |
+
pypdf==5.8.0
|
140 |
+
pypdfium2==4.30.0
|
141 |
+
PyStemmer==2.2.0.3
|
142 |
+
python-bidi==0.6.6
|
143 |
+
python-dateutil==2.9.0.post0
|
144 |
+
python-dotenv==1.1.1
|
145 |
+
pytz==2025.2
|
146 |
+
pyxnat==1.6.3
|
147 |
+
PyYAML==6.0.2
|
148 |
+
pyzmq==27.0.0
|
149 |
+
rdflib==7.1.4
|
150 |
+
regex==2024.11.6
|
151 |
+
requests==2.32.4
|
152 |
+
safetensors==0.5.3
|
153 |
+
scikit-image==0.25.2
|
154 |
+
scikit-learn==1.7.1
|
155 |
+
scipy==1.16.0
|
156 |
+
sentence-transformers==5.0.0
|
157 |
+
setuptools==80.9.0
|
158 |
+
shapely==2.1.1
|
159 |
+
simplejson==3.20.1
|
160 |
+
six==1.17.0
|
161 |
+
sniffio==1.3.1
|
162 |
+
soupsieve==2.7
|
163 |
+
SQLAlchemy==2.0.41
|
164 |
+
stack-data==0.6.3
|
165 |
+
striprtf==0.0.26
|
166 |
+
sympy==1.14.0
|
167 |
+
tabula-py==2.10.0
|
168 |
+
tabulate==0.9.0
|
169 |
+
tenacity==9.1.2
|
170 |
+
threadpoolctl==3.6.0
|
171 |
+
tifffile==2025.6.11
|
172 |
+
tiktoken==0.9.0
|
173 |
+
tokenizers==0.21.2
|
174 |
+
torch==2.7.1
|
175 |
+
torchvision==0.22.1
|
176 |
+
tornado==6.5.1
|
177 |
+
tqdm==4.67.1
|
178 |
+
traitlets==5.14.3
|
179 |
+
traits==7.0.2
|
180 |
+
transformers==4.53.2
|
181 |
+
typing-inspect==0.9.0
|
182 |
+
typing-inspection==0.4.1
|
183 |
+
typing_extensions==4.14.1
|
184 |
+
tzdata==2025.2
|
185 |
+
urllib3==2.5.0
|
186 |
+
wcwidth==0.2.13
|
187 |
+
wrapt==1.17.2
|
188 |
+
yarl==1.20.1
|
src/pdf-version/data/__init__.py
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
"""Data loading and PDF processing."""
|
2 |
-
|
3 |
-
from .loaders import load_annotations, filter_pdf_files
|
4 |
-
from .pdf_processing import (
|
5 |
-
extract_pdf_text,
|
6 |
-
extract_tables_from_pdf,
|
7 |
-
extract_images_ocr_from_pdf,
|
8 |
-
extract_pdf_content_enhanced
|
9 |
-
)
|
10 |
-
|
11 |
-
__all__ = [
|
12 |
-
'load_annotations', 'filter_pdf_files',
|
13 |
-
'extract_pdf_text', 'extract_tables_from_pdf',
|
14 |
-
'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
|
15 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/pdf-version/main.py
DELETED
@@ -1,83 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""OnCall AI - Medical RAG System
|
3 |
-
|
4 |
-
Main entry point for the medical RAG system.
|
5 |
-
"""
|
6 |
-
|
7 |
-
import sys
|
8 |
-
from pathlib import Path
|
9 |
-
|
10 |
-
# Add pdf-version directory to Python path
|
11 |
-
sys.path.insert(0, str(Path(__file__).parent))
|
12 |
-
|
13 |
-
from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
14 |
-
|
15 |
-
|
16 |
-
def main():
    """Build the RAG system, then run one chunk-based TOP-P demo query.

    A keyboard interrupt ends the run with a short message; any other
    exception is reported together with its traceback.
    """
    banner = "=" * 80
    try:
        # Build the system with chunk embeddings enabled.
        build_medical_rag_system(enable_chunk_embeddings=True)

        # Chunk-based retrieval demo.
        print("\n" + banner)
        print("π§© CHUNK-BASED RETRIEVAL DEMO")
        print(banner)
        demo_rag_query(
            "chest pain and shortness of breath",
            strategy="top_p",
            use_chunks=True,
            top_p=0.8,
        )
    except KeyboardInterrupt:
        print("\n\nπ User interrupted, program exiting")
    except Exception as e:
        print(f"\nβ Program execution error: {e}")
        import traceback
        traceback.print_exc()
|
35 |
-
|
36 |
-
|
37 |
-
def interactive_demo():
    """Run a console menu loop for building the system and issuing queries.

    The menu is printed on every iteration. Options 2-5 prompt for a query
    (option 5 also prompts for a strategy name) and only run when the
    prompted values are non-empty. Option 6 leaves the loop; any other input
    prints an error and shows the menu again.
    """
    print("π₯ OnCall AI - Interactive Demo Mode")
    print("=" * 50)

    menu_lines = (
        "\nOptions:",
        "1. Build/rebuild system",
        "2. Query with TOP-P strategy",
        "3. Query with TOP-K strategy",
        "4. Compare all strategies",
        "5. Custom query",
        "6. Exit",
    )

    while True:
        for line in menu_lines:
            print(line)

        choice = input("\nSelect option (1-6): ").strip()

        if choice == "6":
            print("π Goodbye!")
            break

        if choice == "1":
            build_medical_rag_system(enable_chunk_embeddings=True)
            continue

        if choice not in ("2", "3", "4", "5"):
            print("β Invalid option. Please select 1-6.")
            continue

        # Every remaining option starts by asking for the query text.
        query = input("Enter your query: ").strip()

        if choice == "2":
            if query:
                demo_rag_query(query, strategy="top_p", use_chunks=True)
        elif choice == "3":
            if query:
                demo_rag_query(query, strategy="top_k", use_chunks=True, top_k=3)
        elif choice == "4":
            if query:
                demo_all_strategies(query)
        else:
            # Option 5: the strategy is always prompted for, even if the query is empty.
            strategy = input("Enter strategy (top_k/top_p/threshold): ").strip()
            if query and strategy:
                demo_rag_query(query, strategy=strategy, use_chunks=True)
|
77 |
-
|
78 |
-
|
79 |
-
if __name__ == "__main__":
    # "--interactive" as the first CLI argument selects the menu loop;
    # anything else (or no argument) runs the one-shot demo.
    entry_point = interactive_demo if sys.argv[1:2] == ["--interactive"] else main
    entry_point()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/pdf-version/oncall_ai.py
DELETED
@@ -1,55 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
"""OnCall AI - Medical RAG System (Backward Compatibility)
|
3 |
-
|
4 |
-
This file provides backward compatibility with the original rag.py interface.
|
5 |
-
Import everything from the new modular structure.
|
6 |
-
"""
|
7 |
-
|
8 |
-
import sys
|
9 |
-
from pathlib import Path
|
10 |
-
|
11 |
-
# Add pdf-version directory to Python path
|
12 |
-
sys.path.insert(0, str(Path(__file__).parent))
|
13 |
-
|
14 |
-
# Import all functions for backward compatibility
|
15 |
-
from models.embedding_models import load_biomedbert_model, load_meditron_model
|
16 |
-
from data.loaders import load_annotations, filter_pdf_files
|
17 |
-
from data.pdf_processing import (
|
18 |
-
extract_pdf_text, extract_tables_from_pdf,
|
19 |
-
extract_images_ocr_from_pdf, extract_pdf_content_enhanced
|
20 |
-
)
|
21 |
-
from indexing.document_indexer import build_document_index, split_text_into_chunks
|
22 |
-
from indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
|
23 |
-
from indexing.storage import save_document_system, load_document_system
|
24 |
-
from retrieval.document_retriever import (
|
25 |
-
find_relevant_documents_top_k, find_relevant_documents_top_p,
|
26 |
-
find_relevant_documents_threshold, find_relevant_documents,
|
27 |
-
create_document_tag_mapping
|
28 |
-
)
|
29 |
-
from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
30 |
-
from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
31 |
-
|
32 |
-
# Main function for backward compatibility
|
33 |
-
def main():
    """Backward-compatible entry point: build the system and run the demo query.

    Builds the RAG system with chunk embeddings, prints a banner and issues a
    single TOP-P chunk query. A keyboard interrupt prints a short notice;
    other exceptions are printed together with their traceback.
    """
    separator = "=" * 80
    try:
        # System build, chunk embeddings switched on.
        build_medical_rag_system(enable_chunk_embeddings=True)

        # Banner followed by the demo query.
        print("\n" + separator)
        print("π§© CHUNK-BASED RETRIEVAL DEMO")
        print(separator)
        demo_query = "chest pain and shortness of breath"
        demo_rag_query(demo_query, strategy="top_p", use_chunks=True, top_p=0.8)

    except KeyboardInterrupt:
        print("\n\nπ User interrupted, program exiting")
    except Exception as e:
        print(f"\nβ Program execution error: {e}")
        import traceback
        traceback.print_exc()
|
52 |
-
|
53 |
-
|
54 |
-
if __name__ == "__main__":
|
55 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|