Spaces:
Sleeping
Sleeping
Modify paths to align with the current file structure; the folder structure still needs rework.
Browse files
- src/pdf-version/data/loaders.py +14 -2
- src/pdf-version/demos/demo_runner.py +7 -7
- src/pdf-version/generate_embeddings.py +3 -3
- src/pdf-version/indexing/document_indexer.py +1 -1
- src/pdf-version/indexing/storage.py +17 -2
- src/pdf-version/main.py +3 -3
- src/pdf-version/oncall_ai.py +11 -11
- src/pdf-version/rag/medical_rag_pipeline.py +26 -8
src/pdf-version/data/loaders.py
CHANGED
@@ -5,7 +5,7 @@ import os
|
|
5 |
from typing import List, Dict
|
6 |
|
7 |
|
8 |
-
def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
|
9 |
"""Load medical annotations from JSON file.
|
10 |
|
11 |
Args:
|
@@ -14,6 +14,12 @@ def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
|
|
14 |
Returns:
|
15 |
List of annotation dictionaries.
|
16 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
try:
|
18 |
with open(file_path, 'r', encoding='utf-8') as f:
|
19 |
annotations = json.load(f)
|
@@ -25,7 +31,7 @@ def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
|
|
25 |
return []
|
26 |
|
27 |
|
28 |
-
def filter_pdf_files(annotations: List[Dict], assets_dir: str = "assets") -> List[str]:
|
29 |
"""Filter and validate PDF files from annotations.
|
30 |
|
31 |
Args:
|
@@ -35,6 +41,12 @@ def filter_pdf_files(annotations: List[Dict], assets_dir: str = "assets") -> Lis
|
|
35 |
Returns:
|
36 |
List of valid PDF filenames.
|
37 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
pdf_files = []
|
39 |
|
40 |
for item in annotations:
|
|
|
5 |
from typing import List, Dict
|
6 |
|
7 |
|
8 |
+
def load_annotations(file_path: str = None) -> List[Dict]:
|
9 |
"""Load medical annotations from JSON file.
|
10 |
|
11 |
Args:
|
|
|
14 |
Returns:
|
15 |
List of annotation dictionaries.
|
16 |
"""
|
17 |
+
if file_path is None:
|
18 |
+
# Get project root directory (four .parent hops from this file: data/ -> pdf-version/ -> src/ -> project root)
|
19 |
+
from pathlib import Path
|
20 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
21 |
+
file_path = root_dir / 'embeddings' / 'mapping.json'
|
22 |
+
|
23 |
try:
|
24 |
with open(file_path, 'r', encoding='utf-8') as f:
|
25 |
annotations = json.load(f)
|
|
|
31 |
return []
|
32 |
|
33 |
|
34 |
+
def filter_pdf_files(annotations: List[Dict], assets_dir: str = None) -> List[str]:
|
35 |
"""Filter and validate PDF files from annotations.
|
36 |
|
37 |
Args:
|
|
|
41 |
Returns:
|
42 |
List of valid PDF filenames.
|
43 |
"""
|
44 |
+
if assets_dir is None:
|
45 |
+
# Get project root directory
|
46 |
+
from pathlib import Path
|
47 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
48 |
+
assets_dir = root_dir / 'assets'
|
49 |
+
|
50 |
pdf_files = []
|
51 |
|
52 |
for item in annotations:
|
src/pdf-version/demos/demo_runner.py
CHANGED
@@ -2,13 +2,13 @@
|
|
2 |
|
3 |
from typing import Optional
|
4 |
|
5 |
-
from
|
6 |
-
from
|
7 |
-
from
|
8 |
-
from
|
9 |
-
from
|
10 |
-
from
|
11 |
-
from
|
12 |
|
13 |
|
14 |
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
|
|
|
2 |
|
3 |
from typing import Optional
|
4 |
|
5 |
+
from models.embedding_models import load_biomedbert_model
|
6 |
+
from data.loaders import load_annotations
|
7 |
+
from indexing.document_indexer import build_document_index
|
8 |
+
from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
|
9 |
+
from indexing.storage import save_document_system, load_document_system
|
10 |
+
from retrieval.document_retriever import create_document_tag_mapping, find_relevant_documents
|
11 |
+
from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
12 |
|
13 |
|
14 |
def build_medical_rag_system(enable_chunk_embeddings: bool = True):
|
src/pdf-version/generate_embeddings.py
CHANGED
@@ -6,10 +6,10 @@ Quick script to generate new embeddings with sentence-based chunking
|
|
6 |
import sys
|
7 |
from pathlib import Path
|
8 |
|
9 |
-
# Add
|
10 |
-
sys.path.insert(0, str(Path(__file__).parent
|
11 |
|
12 |
-
from
|
13 |
|
14 |
def main():
|
15 |
print("π Starting to build medical RAG system with new sentence-based chunking...")
|
|
|
6 |
import sys
|
7 |
from pathlib import Path
|
8 |
|
9 |
+
# Add pdf-version directory to Python path
|
10 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
11 |
|
12 |
+
from demos.demo_runner import build_medical_rag_system
|
13 |
|
14 |
def main():
|
15 |
print("π Starting to build medical RAG system with new sentence-based chunking...")
|
src/pdf-version/indexing/document_indexer.py
CHANGED
@@ -4,7 +4,7 @@ import os
|
|
4 |
from typing import List, Dict
|
5 |
from llama_index.core import Document
|
6 |
from llama_index.core.node_parser import SentenceSplitter
|
7 |
-
from
|
8 |
|
9 |
|
10 |
def split_text_into_chunks(text: str, chunk_size: int = 256, chunk_overlap: int = 25) -> List[Dict]:
|
|
|
4 |
from typing import List, Dict
|
5 |
from llama_index.core import Document
|
6 |
from llama_index.core.node_parser import SentenceSplitter
|
7 |
+
from data.pdf_processing import extract_pdf_content_enhanced
|
8 |
|
9 |
|
10 |
def split_text_into_chunks(text: str, chunk_size: int = 256, chunk_overlap: int = 25) -> List[Dict]:
|
src/pdf-version/indexing/storage.py
CHANGED
@@ -8,7 +8,7 @@ import numpy as np
|
|
8 |
|
9 |
def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
10 |
doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
|
11 |
-
output_dir: str =
|
12 |
"""Save the complete document indexing system.
|
13 |
|
14 |
Args:
|
@@ -19,6 +19,15 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
|
19 |
output_dir: Output directory for saved files.
|
20 |
"""
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Save document index (content + metadata + chunks)
|
23 |
doc_index_serializable = {}
|
24 |
for doc_name, doc_info in document_index.items():
|
@@ -79,7 +88,7 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
|
79 |
print("✅ Document system saved to files")
|
80 |
|
81 |
|
82 |
-
def load_document_system(input_dir: str = ".") -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
|
83 |
"""Load the complete document indexing system.
|
84 |
|
85 |
Args:
|
@@ -89,6 +98,12 @@ def load_document_system(input_dir: str = ".") -> Tuple[Optional[Dict], Optional
|
|
89 |
Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
|
90 |
Returns (None, None, None, None) if loading fails.
|
91 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
try:
|
93 |
# Load document index
|
94 |
with open(os.path.join(input_dir, 'document_index.json'), 'r', encoding='utf-8') as f:
|
|
|
8 |
|
9 |
def save_document_system(document_index: Dict, tag_embeddings: Dict,
|
10 |
doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
|
11 |
+
output_dir: str = None):
|
12 |
"""Save the complete document indexing system.
|
13 |
|
14 |
Args:
|
|
|
19 |
output_dir: Output directory for saved files.
|
20 |
"""
|
21 |
|
22 |
+
if output_dir is None:
|
23 |
+
# Get project root directory
|
24 |
+
from pathlib import Path
|
25 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
26 |
+
output_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
27 |
+
|
28 |
+
# Ensure output directory exists
|
29 |
+
os.makedirs(output_dir, exist_ok=True)
|
30 |
+
|
31 |
# Save document index (content + metadata + chunks)
|
32 |
doc_index_serializable = {}
|
33 |
for doc_name, doc_info in document_index.items():
|
|
|
88 |
print("✅ Document system saved to files")
|
89 |
|
90 |
|
91 |
+
def load_document_system(input_dir: str = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
|
92 |
"""Load the complete document indexing system.
|
93 |
|
94 |
Args:
|
|
|
98 |
Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
|
99 |
Returns (None, None, None, None) if loading fails.
|
100 |
"""
|
101 |
+
if input_dir is None:
|
102 |
+
# Get project root directory
|
103 |
+
from pathlib import Path
|
104 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
105 |
+
input_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
106 |
+
|
107 |
try:
|
108 |
# Load document index
|
109 |
with open(os.path.join(input_dir, 'document_index.json'), 'r', encoding='utf-8') as f:
|
src/pdf-version/main.py
CHANGED
@@ -7,10 +7,10 @@ Main entry point for the medical RAG system.
|
|
7 |
import sys
|
8 |
from pathlib import Path
|
9 |
|
10 |
-
# Add
|
11 |
-
sys.path.insert(0, str(Path(__file__).parent
|
12 |
|
13 |
-
from
|
14 |
|
15 |
|
16 |
def main():
|
|
|
7 |
import sys
|
8 |
from pathlib import Path
|
9 |
|
10 |
+
# Add pdf-version directory to Python path
|
11 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
12 |
|
13 |
+
from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
14 |
|
15 |
|
16 |
def main():
|
src/pdf-version/oncall_ai.py
CHANGED
@@ -8,26 +8,26 @@ Import everything from the new modular structure.
|
|
8 |
import sys
|
9 |
from pathlib import Path
|
10 |
|
11 |
-
# Add
|
12 |
-
sys.path.insert(0, str(Path(__file__).parent
|
13 |
|
14 |
# Import all functions for backward compatibility
|
15 |
-
from
|
16 |
-
from
|
17 |
-
from
|
18 |
extract_pdf_text, extract_tables_from_pdf,
|
19 |
extract_images_ocr_from_pdf, extract_pdf_content_enhanced
|
20 |
)
|
21 |
-
from
|
22 |
-
from
|
23 |
-
from
|
24 |
-
from
|
25 |
find_relevant_documents_top_k, find_relevant_documents_top_p,
|
26 |
find_relevant_documents_threshold, find_relevant_documents,
|
27 |
create_document_tag_mapping
|
28 |
)
|
29 |
-
from
|
30 |
-
from
|
31 |
|
32 |
# Main function for backward compatibility
|
33 |
def main():
|
|
|
8 |
import sys
|
9 |
from pathlib import Path
|
10 |
|
11 |
+
# Add pdf-version directory to Python path
|
12 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
13 |
|
14 |
# Import all functions for backward compatibility
|
15 |
+
from models.embedding_models import load_biomedbert_model, load_meditron_model
|
16 |
+
from data.loaders import load_annotations, filter_pdf_files
|
17 |
+
from data.pdf_processing import (
|
18 |
extract_pdf_text, extract_tables_from_pdf,
|
19 |
extract_images_ocr_from_pdf, extract_pdf_content_enhanced
|
20 |
)
|
21 |
+
from indexing.document_indexer import build_document_index, split_text_into_chunks
|
22 |
+
from indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
|
23 |
+
from indexing.storage import save_document_system, load_document_system
|
24 |
+
from retrieval.document_retriever import (
|
25 |
find_relevant_documents_top_k, find_relevant_documents_top_p,
|
26 |
find_relevant_documents_threshold, find_relevant_documents,
|
27 |
create_document_tag_mapping
|
28 |
)
|
29 |
+
from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
|
30 |
+
from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
|
31 |
|
32 |
# Main function for backward compatibility
|
33 |
def main():
|
src/pdf-version/rag/medical_rag_pipeline.py
CHANGED
@@ -7,10 +7,6 @@ from typing import Dict, List, Optional, Tuple
|
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
|
9 |
# Import existing retrieval components
|
10 |
-
import sys
|
11 |
-
import os
|
12 |
-
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
13 |
-
|
14 |
from retrieval.document_retriever import find_relevant_documents
|
15 |
from retrieval.chunk_retriever import find_relevant_chunks, get_chunks_for_rag
|
16 |
from models.embedding_models import load_biomedbert_model
|
@@ -391,10 +387,10 @@ def answer_medical_query(query: str,
|
|
391 |
return complete_result
|
392 |
|
393 |
|
394 |
-
def load_rag_data(tag_embeddings_path: str = "tag_embeddings.json",
|
395 |
-
chunk_embeddings_path: str =
|
396 |
-
doc_tag_mapping_path: str =
|
397 |
-
document_index_path: str =
|
398 |
"""
|
399 |
Load all RAG data needed for medical question answering.
|
400 |
|
@@ -409,6 +405,28 @@ def load_rag_data(tag_embeddings_path: str = "tag_embeddings.json",
|
|
409 |
"""
|
410 |
print("π Loading Medical RAG Data...")
|
411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
# Load embedding model
|
413 |
print("📦 Loading BGE Large Medical embedding model...")
|
414 |
embedding_model = load_biomedbert_model()
|
|
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
|
9 |
# Import existing retrieval components
|
|
|
|
|
|
|
|
|
10 |
from retrieval.document_retriever import find_relevant_documents
|
11 |
from retrieval.chunk_retriever import find_relevant_chunks, get_chunks_for_rag
|
12 |
from models.embedding_models import load_biomedbert_model
|
|
|
387 |
return complete_result
|
388 |
|
389 |
|
390 |
+
def load_rag_data(tag_embeddings_path: str = None,
|
391 |
+
chunk_embeddings_path: str = None,
|
392 |
+
doc_tag_mapping_path: str = None,
|
393 |
+
document_index_path: str = None) -> Tuple[SentenceTransformer, Dict, Dict, Dict, Dict]:
|
394 |
"""
|
395 |
Load all RAG data needed for medical question answering.
|
396 |
|
|
|
405 |
"""
|
406 |
print("π Loading Medical RAG Data...")
|
407 |
|
408 |
+
# Set default paths if not provided
|
409 |
+
if tag_embeddings_path is None:
|
410 |
+
from pathlib import Path
|
411 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
412 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
413 |
+
tag_embeddings_path = embeddings_dir / 'tag_embeddings.json'
|
414 |
+
if chunk_embeddings_path is None:
|
415 |
+
from pathlib import Path
|
416 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
417 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
418 |
+
chunk_embeddings_path = embeddings_dir / 'chunk_embeddings.json'
|
419 |
+
if doc_tag_mapping_path is None:
|
420 |
+
from pathlib import Path
|
421 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
422 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
423 |
+
doc_tag_mapping_path = embeddings_dir / 'document_tag_mapping.json'
|
424 |
+
if document_index_path is None:
|
425 |
+
from pathlib import Path
|
426 |
+
root_dir = Path(__file__).parent.parent.parent.parent
|
427 |
+
embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
|
428 |
+
document_index_path = embeddings_dir / 'document_index.json'
|
429 |
+
|
430 |
# Load embedding model
|
431 |
print("📦 Loading BGE Large Medical embedding model...")
|
432 |
embedding_model = load_biomedbert_model()
|