VanKee committed on
Commit 42d7509 · 1 Parent(s): 5c8e4ec

Modifying paths to align with the current file structure; the folder structure needs rework.

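The common pattern in this commit: functions and scripts stop defaulting to working-directory-relative paths ('mapping.json', 'assets', '.') and instead accept None, resolving the project root from their own file location when no path is supplied. A minimal sketch of that pattern, stripped of the repo specifics (load_example is a made-up name, not a function in this codebase):

from pathlib import Path
from typing import Optional

def load_example(file_path: Optional[str] = None) -> Path:
    if file_path is None:
        # Walk up from this source file to the project root, so the default
        # works no matter which directory the caller runs from.
        root_dir = Path(__file__).parent.parent.parent.parent
        file_path = root_dir / 'embeddings' / 'mapping.json'
    return Path(file_path)

The number of .parent hops depends on where each module sits relative to the repo root, which is part of why the folder structure is flagged for rework.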
src/pdf-version/data/loaders.py CHANGED
@@ -5,7 +5,7 @@ import os
 from typing import List, Dict
 
 
-def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
+def load_annotations(file_path: str = None) -> List[Dict]:
     """Load medical annotations from JSON file.
 
     Args:
@@ -14,6 +14,12 @@ def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
     Returns:
         List of annotation dictionaries.
     """
+    if file_path is None:
+        # Get project root directory (3 levels up from this file)
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        file_path = root_dir / 'embeddings' / 'mapping.json'
+
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             annotations = json.load(f)
@@ -25,7 +31,7 @@ def load_annotations(file_path: str = 'mapping.json') -> List[Dict]:
     return []
 
 
-def filter_pdf_files(annotations: List[Dict], assets_dir: str = "assets") -> List[str]:
+def filter_pdf_files(annotations: List[Dict], assets_dir: str = None) -> List[str]:
     """Filter and validate PDF files from annotations.
 
     Args:
@@ -35,6 +41,12 @@ def filter_pdf_files(annotations: List[Dict], assets_dir: str = "assets") -> List[str]:
     Returns:
         List of valid PDF filenames.
     """
+    if assets_dir is None:
+        # Get project root directory
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        assets_dir = root_dir / 'assets'
+
     pdf_files = []
 
     for item in annotations:
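With the new defaults, callers inside src/pdf-version no longer need to pass paths at all: the annotations are expected at <repo root>/embeddings/mapping.json and the PDFs under <repo root>/assets. A usage sketch, assuming src/pdf-version is already on sys.path (as the entry points below arrange):

from data.loaders import load_annotations, filter_pdf_files

# Defaults resolve against the repository root, not the current directory
annotations = load_annotations()
pdf_files = filter_pdf_files(annotations)

# Explicit paths still behave as before
annotations = load_annotations('embeddings/mapping.json')
pdf_files = filter_pdf_files(annotations, assets_dir='assets')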
src/pdf-version/demos/demo_runner.py CHANGED
@@ -2,13 +2,13 @@
 
 from typing import Optional
 
-from src.models.embedding_models import load_biomedbert_model
-from src.data.loaders import load_annotations
-from src.indexing.document_indexer import build_document_index
-from src.indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
-from src.indexing.storage import save_document_system, load_document_system
-from src.retrieval.document_retriever import create_document_tag_mapping, find_relevant_documents
-from src.retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
+from models.embedding_models import load_biomedbert_model
+from data.loaders import load_annotations
+from indexing.document_indexer import build_document_index
+from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
+from indexing.storage import save_document_system, load_document_system
+from retrieval.document_retriever import create_document_tag_mapping, find_relevant_documents
+from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
 
 
 def build_medical_rag_system(enable_chunk_embeddings: bool = True):
src/pdf-version/generate_embeddings.py CHANGED
@@ -6,10 +6,10 @@ Quick script to generate new embeddings with sentence-based chunking
 import sys
 from pathlib import Path
 
-# Add src directory to Python path
-sys.path.insert(0, str(Path(__file__).parent / "src"))
+# Add pdf-version directory to Python path
+sys.path.insert(0, str(Path(__file__).parent))
 
-from src.demos.demo_runner import build_medical_rag_system
+from demos.demo_runner import build_medical_rag_system
 
 def main():
     print("🚀 Starting to build medical RAG system with new sentence-based chunking...")
src/pdf-version/indexing/document_indexer.py CHANGED
@@ -4,7 +4,7 @@ import os
 from typing import List, Dict
 from llama_index.core import Document
 from llama_index.core.node_parser import SentenceSplitter
-from src.data.pdf_processing import extract_pdf_content_enhanced
+from data.pdf_processing import extract_pdf_content_enhanced
 
 
 def split_text_into_chunks(text: str, chunk_size: int = 256, chunk_overlap: int = 25) -> List[Dict]:
src/pdf-version/indexing/storage.py CHANGED
@@ -8,7 +8,7 @@ import numpy as np
 
 def save_document_system(document_index: Dict, tag_embeddings: Dict,
                          doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
-                         output_dir: str = "."):
+                         output_dir: str = None):
     """Save the complete document indexing system.
 
     Args:
@@ -19,6 +19,15 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
        output_dir: Output directory for saved files.
    """
 
+    if output_dir is None:
+        # Get project root directory
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        output_dir = root_dir / 'embeddings' / 'pdfembeddings'
+
+    # Ensure output directory exists
+    os.makedirs(output_dir, exist_ok=True)
+
     # Save document index (content + metadata + chunks)
     doc_index_serializable = {}
     for doc_name, doc_info in document_index.items():
@@ -79,7 +88,7 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
     print("✅ Document system saved to files")
 
 
-def load_document_system(input_dir: str = ".") -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
+def load_document_system(input_dir: str = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
     """Load the complete document indexing system.
 
     Args:
@@ -89,6 +98,12 @@ def load_document_system(input_dir: str = ".") -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict]]:
     Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings).
     Returns (None, None, None, None) if loading fails.
     """
+    if input_dir is None:
+        # Get project root directory
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        input_dir = root_dir / 'embeddings' / 'pdfembeddings'
+
     try:
         # Load document index
         with open(os.path.join(input_dir, 'document_index.json'), 'r', encoding='utf-8') as f:
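As a sanity check on the four .parent hops: storage.py lives at src/pdf-version/indexing/storage.py, so walking up four times lands on the repository root, and the embeddings are read from and written to <root>/embeddings/pdfembeddings. A quick illustration with a made-up absolute path:

from pathlib import Path

storage_py = Path('/repo/src/pdf-version/indexing/storage.py')  # hypothetical checkout location
root_dir = storage_py.parent.parent.parent.parent               # -> /repo
print(root_dir / 'embeddings' / 'pdfembeddings')                # -> /repo/embeddings/pdfembeddings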
src/pdf-version/main.py CHANGED
@@ -7,10 +7,10 @@ Main entry point for the medical RAG system.
 import sys
 from pathlib import Path
 
-# Add src directory to Python path
-sys.path.insert(0, str(Path(__file__).parent / "src"))
+# Add pdf-version directory to Python path
+sys.path.insert(0, str(Path(__file__).parent))
 
-from src.demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
+from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
 
 
 def main():
src/pdf-version/oncall_ai.py CHANGED
@@ -8,26 +8,26 @@ Import everything from the new modular structure.
 import sys
 from pathlib import Path
 
-# Add src directory to Python path
-sys.path.insert(0, str(Path(__file__).parent / "src"))
+# Add pdf-version directory to Python path
+sys.path.insert(0, str(Path(__file__).parent))
 
 # Import all functions for backward compatibility
-from src.models.embedding_models import load_biomedbert_model, load_meditron_model
-from src.data.loaders import load_annotations, filter_pdf_files
-from src.data.pdf_processing import (
+from models.embedding_models import load_biomedbert_model, load_meditron_model
+from data.loaders import load_annotations, filter_pdf_files
+from data.pdf_processing import (
     extract_pdf_text, extract_tables_from_pdf,
     extract_images_ocr_from_pdf, extract_pdf_content_enhanced
 )
-from src.indexing.document_indexer import build_document_index, split_text_into_chunks
-from src.indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
-from src.indexing.storage import save_document_system, load_document_system
-from src.retrieval.document_retriever import (
+from indexing.document_indexer import build_document_index, split_text_into_chunks
+from indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
+from indexing.storage import save_document_system, load_document_system
+from retrieval.document_retriever import (
     find_relevant_documents_top_k, find_relevant_documents_top_p,
     find_relevant_documents_threshold, find_relevant_documents,
     create_document_tag_mapping
 )
-from src.retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
-from src.demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
+from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
+from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
 
 # Main function for backward compatibility
 def main():
src/pdf-version/rag/medical_rag_pipeline.py CHANGED
@@ -7,10 +7,6 @@ from typing import Dict, List, Optional, Tuple
 from sentence_transformers import SentenceTransformer
 
 # Import existing retrieval components
-import sys
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
 from retrieval.document_retriever import find_relevant_documents
 from retrieval.chunk_retriever import find_relevant_chunks, get_chunks_for_rag
 from models.embedding_models import load_biomedbert_model
@@ -391,10 +387,10 @@ def answer_medical_query(query: str,
     return complete_result
 
 
-def load_rag_data(tag_embeddings_path: str = "tag_embeddings.json",
-                  chunk_embeddings_path: str = "chunk_embeddings.json",
-                  doc_tag_mapping_path: str = "document_tag_mapping.json",
-                  document_index_path: str = "document_index.json") -> Tuple[SentenceTransformer, Dict, Dict, Dict, Dict]:
+def load_rag_data(tag_embeddings_path: str = None,
+                  chunk_embeddings_path: str = None,
+                  doc_tag_mapping_path: str = None,
+                  document_index_path: str = None) -> Tuple[SentenceTransformer, Dict, Dict, Dict, Dict]:
     """
     Load all RAG data needed for medical question answering.
 
@@ -409,6 +405,28 @@ def load_rag_data(tag_embeddings_path: str = "tag_embeddings.json",
     """
     print("🔄 Loading Medical RAG Data...")
 
+    # Set default paths if not provided
+    if tag_embeddings_path is None:
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
+        tag_embeddings_path = embeddings_dir / 'tag_embeddings.json'
+    if chunk_embeddings_path is None:
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
+        chunk_embeddings_path = embeddings_dir / 'chunk_embeddings.json'
+    if doc_tag_mapping_path is None:
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
+        doc_tag_mapping_path = embeddings_dir / 'document_tag_mapping.json'
+    if document_index_path is None:
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        embeddings_dir = root_dir / 'embeddings' / 'pdfembeddings'
+        document_index_path = embeddings_dir / 'document_index.json'
+
     # Load embedding model
     print("📦 Loading BGE Large Medical embedding model...")
     embedding_model = load_biomedbert_model()
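Taken together, load_rag_data() now looks for all four JSON artifacts in the same <root>/embeddings/pdfembeddings directory that save_document_system() writes to, and any individual path can still be overridden. A hedged usage sketch, assuming src/pdf-version is on sys.path and the embeddings have already been generated:

from rag.medical_rag_pipeline import load_rag_data

# Defaults: every file is read from <repo root>/embeddings/pdfembeddings/
embedding_model, *rag_dicts = load_rag_data()

# A single path can be overridden while the others keep the new defaults
embedding_model, *rag_dicts = load_rag_data(
    tag_embeddings_path='embeddings/pdfembeddings/tag_embeddings.json'
)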