Spaces:

ybchen928
/

oncall-guide-ai

Sleeping

YanBoChen committed on 28 days ago

Commit

f3ac7d9

1 Parent(s): 8942859

Add comprehensive tests for chunk quality analysis and embedding validation

- Introduced a new test suite for chunk quality analysis, covering chunk length distribution, chunking method comparison, token vs character analysis, and recommendations generation.
- Enhanced embedding validation tests with detailed logging and checks for embedding dimensions, self-retrieval accuracy, and cross-dataset search functionality.

Files changed (3) hide show

tests/test_chunk_quality_analysis.py +333 -0
tests/test_embedding_and_index.py +96 -24
tests/test_embedding_validation.py +99 -15

tests/test_chunk_quality_analysis.py ADDED Viewed

	@@ -0,0 +1,333 @@

+"""
+Chunk Quality Analysis Tests
+This module analyzes chunk quality and identifies issues with chunk length differences
+between emergency and treatment data processing methods.
+Author: OnCall.ai Team
+Date: 2025-07-28
+"""
+import sys
+import json
+import numpy as np
+from pathlib import Path
+from typing import List, Dict, Tuple
+import logging
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(levelname)s:%(name)s:%(message)s'
+)
+logger = logging.getLogger(__name__)
+# Add src to python path
+current_dir = Path(__file__).parent.resolve()
+project_root = current_dir.parent
+sys.path.append(str(project_root / "src"))
+from data_processing import DataProcessor
+class TestChunkQualityAnalysis:
+    def setup_class(self):
+        """Initialize test environment"""
+        print("\n=== Phase 1: Setting up Chunk Quality Analysis ===")
+        self.base_dir = Path(__file__).parent.parent.resolve()
+        self.models_dir = self.base_dir / "models"
+        self.embeddings_dir = self.models_dir / "embeddings"
+        print(f"• Base directory: {self.base_dir}")
+        print(f"• Models directory: {self.models_dir}")
+        # Initialize processor
+        self.processor = DataProcessor(base_dir=str(self.base_dir))
+        print("• DataProcessor initialized")
+    def test_chunk_length_analysis(self):
+        """Detailed analysis of chunk length distribution"""
+        print("\n=== Phase 2: Chunk Length Distribution Analysis ===")
+        try:
+            # Load chunk data
+            print("• Loading chunk data...")
+            with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
+                emergency_chunks = json.load(f)
+            with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
+                treatment_chunks = json.load(f)
+            # Analyze emergency chunks
+            em_lengths = [len(chunk['text']) for chunk in emergency_chunks]
+            em_token_counts = [chunk.get('token_count', 0) for chunk in emergency_chunks]
+            print(f"\n📊 Emergency Chunks Analysis:")
+            print(f"• Total chunks: {len(em_lengths):,}")
+            print(f"• Min length: {min(em_lengths)} chars")
+            print(f"• Max length: {max(em_lengths)} chars")
+            print(f"• Average length: {sum(em_lengths)/len(em_lengths):.2f} chars")
+            print(f"• Median length: {sorted(em_lengths)[len(em_lengths)//2]} chars")
+            if any(em_token_counts):
+                avg_tokens = sum(em_token_counts)/len(em_token_counts)
+                print(f"• Average tokens: {avg_tokens:.2f}")
+                print(f"• Chars per token ratio: {(sum(em_lengths)/len(em_lengths)) / avg_tokens:.2f}")
+            # Analyze treatment chunks
+            tr_lengths = [len(chunk['text']) for chunk in treatment_chunks]
+            print(f"\n📊 Treatment Chunks Analysis:")
+            print(f"• Total chunks: {len(tr_lengths):,}")
+            print(f"• Min length: {min(tr_lengths)} chars")
+            print(f"• Max length: {max(tr_lengths)} chars")
+            print(f"• Average length: {sum(tr_lengths)/len(tr_lengths):.2f} chars")
+            print(f"• Median length: {sorted(tr_lengths)[len(tr_lengths)//2]} chars")
+            # Length distribution comparison
+            em_avg = sum(em_lengths)/len(em_lengths)
+            tr_avg = sum(tr_lengths)/len(tr_lengths)
+            ratio = em_avg / tr_avg
+            print(f"\n🔍 Length Distribution Comparison:")
+            print(f"• Emergency average: {em_avg:.0f} chars")
+            print(f"• Treatment average: {tr_avg:.0f} chars")
+            print(f"• Ratio (Emergency/Treatment): {ratio:.1f}x")
+            # Length distribution buckets
+            print(f"\n📈 Length Distribution Buckets:")
+            buckets = [0, 100, 250, 500, 1000, 2000, 5000]
+            for i in range(len(buckets)-1):
+                em_count = sum(1 for l in em_lengths if buckets[i] <= l < buckets[i+1])
+                tr_count = sum(1 for l in tr_lengths if buckets[i] <= l < buckets[i+1])
+                print(f"• {buckets[i]}-{buckets[i+1]} chars: Emergency={em_count}, Treatment={tr_count}")
+            # Flag potential issues
+            if ratio > 5.0:
+                print(f"\n⚠️  WARNING: Emergency chunks are {ratio:.1f}x longer than treatment chunks!")
+                print("   This suggests different chunking strategies are being used.")
+            print("✅ Chunk length analysis completed")
+        except Exception as e:
+            print(f"❌ Error in chunk length analysis: {str(e)}")
+            raise
+    def test_chunking_method_comparison(self):
+        """Compare the two chunking methods on the same data"""
+        print("\n=== Phase 3: Chunking Method Comparison ===")
+        try:
+            # Load data
+            print("• Loading dataset for comparison...")
+            self.processor.load_filtered_data()
+            # Test on multiple samples for better analysis
+            sample_size = 5
+            samples = self.processor.treatment_data.head(sample_size)
+            method1_results = []  # keyword_centered_chunks
+            method2_results = []  # dual_keyword_chunks
+            print(f"• Testing {sample_size} samples with both methods...")
+            for idx, row in samples.iterrows():
+                if not row.get('clean_text') or not row.get('treatment_matched'):
+                    continue
+                text_length = len(row['clean_text'])
+                emergency_kw = row.get('matched', '')
+                treatment_kw = row['treatment_matched']
+                # Method 1: keyword_centered_chunks (Emergency method)
+                chunks1 = self.processor.create_keyword_centered_chunks(
+                    text=row['clean_text'],
+                    matched_keywords=emergency_kw,
+                    chunk_size=256,
+                    doc_id=f"test_{idx}"
+                )
+                # Method 2: dual_keyword_chunks (Treatment method)
+                chunks2 = self.processor.create_dual_keyword_chunks(
+                    text=row['clean_text'],
+                    emergency_keywords=emergency_kw,
+                    treatment_keywords=treatment_kw,
+                    chunk_size=256,
+                    doc_id=f"test_{idx}"
+                )
+                # Collect results
+                if chunks1:
+                    avg_len1 = sum(len(c['text']) for c in chunks1) / len(chunks1)
+                    method1_results.append({
+                        'doc_id': idx,
+                        'chunks_count': len(chunks1),
+                        'avg_length': avg_len1,
+                        'text_length': text_length
+                    })
+                if chunks2:
+                    avg_len2 = sum(len(c['text']) for c in chunks2) / len(chunks2)
+                    method2_results.append({
+                        'doc_id': idx,
+                        'chunks_count': len(chunks2),
+                        'avg_length': avg_len2,
+                        'text_length': text_length
+                    })
+            # Analysis results
+            print(f"\n📊 Method Comparison Results:")
+            if method1_results:
+                avg_chunks1 = sum(r['chunks_count'] for r in method1_results) / len(method1_results)
+                avg_len1 = sum(r['avg_length'] for r in method1_results) / len(method1_results)
+                print(f"\n🔹 Keyword-Centered Method (Emergency):")
+                print(f"• Average chunks per document: {avg_chunks1:.1f}")
+                print(f"• Average chunk length: {avg_len1:.0f} chars")
+            if method2_results:
+                avg_chunks2 = sum(r['chunks_count'] for r in method2_results) / len(method2_results)
+                avg_len2 = sum(r['avg_length'] for r in method2_results) / len(method2_results)
+                print(f"\n🔹 Dual-Keyword Method (Treatment):")
+                print(f"• Average chunks per document: {avg_chunks2:.1f}")
+                print(f"• Average chunk length: {avg_len2:.0f} chars")
+                if method1_results:
+                    ratio = avg_len1 / avg_len2
+                    print(f"\n🔍 Length Ratio: {ratio:.1f}x (Method1 / Method2)")
+            print("✅ Chunking method comparison completed")
+        except Exception as e:
+            print(f"❌ Error in method comparison: {str(e)}")
+            raise
+    def test_token_vs_character_analysis(self):
+        """Analyze token vs character differences in chunking"""
+        print("\n=== Phase 4: Token vs Character Analysis ===")
+        try:
+            # Load model for tokenization
+            print("• Loading embedding model for tokenization...")
+            self.processor.load_embedding_model()
+            # Test sample texts
+            test_texts = [
+                "Patient presents with acute chest pain and shortness of breath.",
+                "Emergency treatment for myocardial infarction includes immediate medication.",
+                "The patient's vital signs show tachycardia and hypotension requiring intervention."
+            ]
+            print(f"\n📊 Token vs Character Analysis:")
+            total_chars = 0
+            total_tokens = 0
+            for i, text in enumerate(test_texts, 1):
+                char_count = len(text)
+                token_count = len(self.processor.tokenizer.tokenize(text))
+                ratio = char_count / token_count if token_count > 0 else 0
+                print(f"\nSample {i}:")
+                print(f"• Text: {text[:50]}...")
+                print(f"• Characters: {char_count}")
+                print(f"• Tokens: {token_count}")
+                print(f"• Chars/Token ratio: {ratio:.2f}")
+                total_chars += char_count
+                total_tokens += token_count
+            overall_ratio = total_chars / total_tokens
+            print(f"\n🔍 Overall Character/Token Ratio: {overall_ratio:.2f}")
+            # Estimate chunk sizes
+            target_tokens = 256
+            estimated_chars = target_tokens * overall_ratio
+            print(f"\n📏 Chunk Size Estimates:")
+            print(f"• Target tokens: {target_tokens}")
+            print(f"• Estimated characters: {estimated_chars:.0f}")
+            print(f"• Current emergency avg: 1842 chars ({1842/overall_ratio:.0f} estimated tokens)")
+            print(f"• Current treatment avg: 250 chars ({250/overall_ratio:.0f} estimated tokens)")
+            # Recommendations
+            print(f"\n💡 Recommendations:")
+            if 1842/overall_ratio > 512:
+                print("⚠️  Emergency chunks may exceed model's 512 token limit!")
+            if 250/overall_ratio < 64:
+                print("⚠️  Treatment chunks may be too short for meaningful context!")
+            print("✅ Token vs character analysis completed")
+        except Exception as e:
+            print(f"❌ Error in token analysis: {str(e)}")
+            raise
+    def test_generate_recommendations(self):
+        """Generate recommendations based on analysis"""
+        print("\n=== Phase 5: Generating Recommendations ===")
+        recommendations = []
+        # Based on the known chunk length difference
+        recommendations.append({
+            'issue': 'Inconsistent chunk lengths',
+            'description': 'Emergency chunks (1842 chars) are 7x longer than treatment chunks (250 chars)',
+            'recommendation': 'Standardize both methods to use token-based chunking with consistent parameters',
+            'priority': 'HIGH'
+        })
+        recommendations.append({
+            'issue': 'Different chunking strategies',
+            'description': 'Emergency uses keyword-centered (token-based), Treatment uses dual-keyword (character-based)',
+            'recommendation': 'Update dual_keyword_chunks to use tokenizer for consistent token-based chunking',
+            'priority': 'HIGH'
+        })
+        recommendations.append({
+            'issue': 'Potential token limit overflow',
+            'description': 'Large chunks may exceed PubMedBERT 512 token limit',
+            'recommendation': 'Implement strict token-based chunking with overlap to prevent overflow',
+            'priority': 'MEDIUM'
+        })
+        print(f"\n📋 Analysis Recommendations:")
+        for i, rec in enumerate(recommendations, 1):
+            print(f"\n{i}. {rec['issue']} [{rec['priority']}]")
+            print(f"   Problem: {rec['description']}")
+            print(f"   Solution: {rec['recommendation']}")
+        print("\n✅ Recommendations generated")
+        return recommendations
+def main():
+    """Run all chunk quality analysis tests"""
+    print("\n" + "="*60)
+    print("CHUNK QUALITY ANALYSIS TEST SUITE")
+    print("="*60)
+    test = TestChunkQualityAnalysis()
+    test.setup_class()
+    try:
+        test.test_chunk_length_analysis()
+        test.test_chunking_method_comparison()
+        test.test_token_vs_character_analysis()
+        recommendations = test.test_generate_recommendations()
+        print("\n" + "="*60)
+        print("🎉 ALL CHUNK QUALITY TESTS COMPLETED SUCCESSFULLY!")
+        print("="*60)
+        print(f"\nKey Finding: Chunk length inconsistency detected!")
+        print(f"Emergency: ~1842 chars, Treatment: ~250 chars (7x difference)")
+        print(f"Recommendation: Standardize to token-based chunking")
+        print("="*60)
+    except Exception as e:
+        print("\n" + "="*60)
+        print("❌ CHUNK QUALITY TESTS FAILED!")
+        print(f"Error: {str(e)}")
+        print("="*60)
+if __name__ == "__main__":
+    main()

tests/test_embedding_and_index.py CHANGED Viewed

@@ -1,29 +1,101 @@
 import numpy as np
 from annoy import AnnoyIndex
 import pytest
 from data_processing import DataProcessor
-@pytest.fixture(scope="module")
-def processor():
-    return DataProcessor(base_dir=".")
-def test_embedding_dimensions(processor):
-    # load emergency embeddings
-    emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
-    expected_dim = processor.embedding_dim
-    assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
-    assert emb.shape[1] == expected_dim, (
-        f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
-    )
-def test_annoy_search(processor):
-    # load embeddings
-    emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
-    # load Annoy index
-    idx = AnnoyIndex(processor.embedding_dim, 'angular')
-    idx.load(str(processor.models_dir / "indices" / "annoy" / "emergency_index.ann"))
-    # perform a sample query
-    query_vec = emb[0]
-    ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
-    assert len(ids) == 5
-    assert all(0 <= d <= 2 for d in distances)

+"""
+Basic embedding and index validation tests
+"""
+# 2025-07-28
+import sys
+from pathlib import Path
+#
 import numpy as np
 from annoy import AnnoyIndex
 import pytest
+print("\n=== Phase 1: Initializing Test Environment ===")
+# add src to python path
+current_dir = Path(__file__).parent.resolve()
+project_root = current_dir.parent
+sys.path.append(str(project_root / "src"))
+print(f"• Current directory: {current_dir}")
+print(f"• Project root: {project_root}")
+print(f"• Python path: {sys.path}")
 from data_processing import DataProcessor
+class TestEmbeddingAndIndex:
+    def setup_class(self):
+        """初始化測試類"""
+        print("\n=== Phase 2: Setting up TestEmbeddingAndIndex ===")
+        self.base_dir = Path(__file__).parent.parent.resolve()
+        print(f"• Base directory: {self.base_dir}")
+        self.processor = DataProcessor(base_dir=str(self.base_dir))
+        print("• DataProcessor initialized")
+    def test_embedding_dimensions(self):
+        print("\n=== Phase 3: Testing Embedding Dimensions ===")
+        print("• Loading emergency embeddings...")
+        # load emergency embeddings
+        emb = np.load(self.processor.models_dir / "embeddings" / "emergency_embeddings.npy")
+        expected_dim = self.processor.embedding_dim
+        print(f"• Loaded embedding shape: {emb.shape}")
+        print(f"• Expected dimension: {expected_dim}")
+        assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
+        assert emb.shape[1] == expected_dim, (
+            f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
+        )
+        print("✅ Embedding dimensions test passed")
+    def test_annoy_search(self):
+        print("\n=== Phase 4: Testing Annoy Search ===")
+        print("• Loading embeddings...")
+        # load embeddings
+        emb = np.load(self.processor.models_dir / "embeddings" / "emergency_embeddings.npy")
+        print(f"• Loaded embeddings shape: {emb.shape}")
+        print("• Loading Annoy index...")
+        # load Annoy index
+        idx = AnnoyIndex(self.processor.embedding_dim, 'angular')
+        index_path = self.processor.models_dir / "indices" / "annoy" / "emergency_index.ann"
+        print(f"• Index path: {index_path}")
+        idx.load(str(index_path))
+        print("• Performing sample query...")
+        # perform a sample query
+        query_vec = emb[0]
+        ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
+        print(f"• Search results:")
+        print(f"  - Found IDs: {ids}")
+        print(f"  - Distances: {[f'{d:.4f}' for d in distances]}")
+        assert len(ids) == 5, f"Expected 5 results, got {len(ids)}"
+        assert all(0 <= d <= 2 for d in distances), "Invalid distance values"
+        print("✅ Annoy search test passed")
+def main():
+    """Run tests manually"""
+    print("\n" + "="*50)
+    print("Starting Embedding and Index Tests")
+    print("="*50)
+    test = TestEmbeddingAndIndex()
+    test.setup_class()  # 手動初始化
+    try:
+        test.test_embedding_dimensions()
+        test.test_annoy_search()
+        print("\n" + "="*50)
+        print("🎉 All tests completed successfully!")
+        print("="*50)
+    except Exception as e:
+        print("\n" + "="*50)
+        print("❌ Tests failed!")
+        print(f"Error: {str(e)}")
+        print("="*50)
+if __name__ == "__main__":
+    main()

tests/test_embedding_validation.py CHANGED Viewed

@@ -7,14 +7,27 @@ import numpy as np
 import json
 import logging
 import os
 from pathlib import Path
 from typing import Tuple, List, Optional
 from annoy import AnnoyIndex
 from sentence_transformers import SentenceTransformer
 class TestEmbeddingValidation:
     def setup_class(self):
         """Initialize test environment with necessary data and models."""
         # Setup logging
         logging.basicConfig(
             level=logging.DEBUG,
@@ -24,43 +37,57 @@ class TestEmbeddingValidation:
         self.logger = logging.getLogger(__name__)
         # Define base paths
-        self.project_root = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
         self.models_dir = self.project_root / "models"
         self.embeddings_dir = self.models_dir / "embeddings"
         self.indices_dir = self.models_dir / "indices" / "annoy"
         self.logger.info(f"Project root: {self.project_root}")
         self.logger.info(f"Models directory: {self.models_dir}")
         self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
         try:
             # Check directory existence
             if not self.embeddings_dir.exists():
                 raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
             if not self.indices_dir.exists():
                 raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
             # Load embeddings
             self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
             self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
             # Load chunks
             with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
                 self.emergency_chunks = json.load(f)
             with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
                 self.treatment_chunks = json.load(f)
             # Initialize model
             self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
             self.logger.info("Test environment initialized successfully")
             self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
             self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
         except FileNotFoundError as e:
             self.logger.error(f"File not found: {e}")
             raise
         except Exception as e:
             self.logger.error(f"Error during initialization: {e}")
             raise
@@ -84,20 +111,28 @@ class TestEmbeddingValidation:
     def test_embedding_dimensions(self):
         """Test embedding dimensions and data quality."""
         self.logger.info("\n=== Embedding Validation Report ===")
         try:
             # Basic dimension checks
             assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
             assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
             # Count verification
             assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
                 "Emergency chunks count mismatch"
             assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
                 "Treatment chunks count mismatch"
             # Data quality checks
             for name, emb in [("Emergency", self.emergency_emb),
                              ("Treatment", self.treatment_emb)]:
                 # Check for NaN and Inf
@@ -105,25 +140,35 @@ class TestEmbeddingValidation:
                 assert not np.isinf(emb).any(), f"{name} contains Inf values"
                 # Value distribution analysis
                 self.logger.info(f"\n{name} Embeddings Statistics:")
                 self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
                 self.logger.info(f"- Mean: {np.mean(emb):.3f}")
                 self.logger.info(f"- Std: {np.std(emb):.3f}")
             self.logger.info("\n✅ All embedding validations passed")
         except AssertionError as e:
             self.logger.error(f"Validation failed: {str(e)}")
             raise
     def test_multiple_known_item_search(self):
         """Test ANNOY search with multiple random samples."""
         self.logger.info("\n=== Multiple Known-Item Search Test ===")
         emergency_index = AnnoyIndex(768, 'angular')
         emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
         # Test 20 random samples
         test_indices = np.random.choice(
             self.emergency_emb.shape[0],
             size=20,
@@ -131,36 +176,45 @@ class TestEmbeddingValidation:
         )
         success_count = 0
-        for test_idx in test_indices:
             try:
                 test_emb = self.emergency_emb[test_idx]
                 indices, distances = self._safe_search(emergency_index, test_emb)
                 if indices is None:
                     continue
                 # Verify self-retrieval
                 assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
                 assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
                 success_count += 1
             except AssertionError as e:
                 self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
         self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
         assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
     def test_balanced_cross_dataset_search(self):
         """Test search across both emergency and treatment datasets."""
         self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
         # Initialize indices
         emergency_index = AnnoyIndex(768, 'angular')
         treatment_index = AnnoyIndex(768, 'angular')
         try:
             emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
             treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
             # Test queries
             test_queries = [
@@ -169,45 +223,75 @@ class TestEmbeddingValidation:
                 "What are the emergency procedures for anaphylactic shock?"
             ]
-            for query in test_queries:
-                print(f"\n\n=== Query: {query} ===")
                 # Generate query vector
                 query_emb = self.model.encode([query])[0]
                 # Get top-5 results from each dataset
                 e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
                 t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
                 if None in [e_indices, e_distances, t_indices, t_distances]:
                     self.logger.error("Search failed for one or both datasets")
                     continue
                 # Print first sentence of each result
-                print("\nEmergency Dataset Results:")
                 for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
                     text = self.emergency_chunks[idx]['text']
                     first_sentence = text.split('.')[0] + '.'
-                    print(f"\nE-{i} (distance: {dist:.3f}):")
-                    print(first_sentence)
-                print("\nTreatment Dataset Results:")
                 for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
                     text = self.treatment_chunks[idx]['text']
                     first_sentence = text.split('.')[0] + '.'
-                    print(f"\nT-{i} (distance: {dist:.3f}):")
-                    print(first_sentence)
         except Exception as e:
             self.logger.error(f"Test failed: {str(e)}")
             raise
         else:
             self.logger.info("\n✅ Cross-dataset search test completed")
-if __name__ == "__main__":
-    # Manual test execution
     test = TestEmbeddingValidation()
     test.setup_class()
-    test.test_embedding_dimensions()
-    test.test_multiple_known_item_search()
-    test.test_balanced_cross_dataset_search()

 import json
 import logging
 import os
+import sys
 from pathlib import Path
 from typing import Tuple, List, Optional
 from annoy import AnnoyIndex
 from sentence_transformers import SentenceTransformer
print("\n=== Phase 1: Initializing Test Environment ===")

# The test modules live in <project_root>/tests, so the importable sources
# sit one directory up, under src/.
current_dir = Path(__file__).parent.resolve()
project_root = current_dir.parent
_src_dir = project_root / "src"

# Register src/ only once: re-importing this module in the same interpreter
# would otherwise keep appending duplicate sys.path entries.
if str(_src_dir) not in sys.path:
    sys.path.append(str(_src_dir))

print(f"• Current directory: {current_dir}")
print(f"• Project root: {project_root}")
print(f"• Python path added: {_src_dir}")
 class TestEmbeddingValidation:
     def setup_class(self):
         """Initialize test environment with necessary data and models.

         Resolves the ``models`` directory tree relative to this file, checks
         that the embeddings and ANNOY index directories exist, loads the
         emergency/treatment embedding arrays together with their chunk
         metadata, and instantiates the sentence-embedding model. Everything
         is stored on ``self`` for the test methods to read.

         Raises:
             FileNotFoundError: If the embeddings or indices directory is
                 missing; the error is printed, logged and re-raised.
             Exception: Any other failure during loading is printed, logged
                 and re-raised unchanged.
         """
         # NOTE(review): this hook is not decorated with @classmethod; when
         # pytest invokes it, `self` is presumably the class object, so the
         # attributes below would be class-level — confirm.
+        print("\n=== Phase 2: Setting up Test Environment ===")
         # Setup logging
         # NOTE(review): the remainder of this basicConfig(...) call (any
         # further arguments and the closing parenthesis) is not visible in
         # this excerpt; confirm against the full file.
         logging.basicConfig(
             level=logging.DEBUG,
         self.logger = logging.getLogger(__name__)
         # Define base paths
         # Two .parent hops: from this file to its folder, then to the project root.
+        self.project_root = Path(__file__).parent.parent.resolve()
         self.models_dir = self.project_root / "models"
         self.embeddings_dir = self.models_dir / "embeddings"
         self.indices_dir = self.models_dir / "indices" / "annoy"
+        print(f"• Project root: {self.project_root}")
+        print(f"• Models directory: {self.models_dir}")
+        print(f"• Embeddings directory: {self.embeddings_dir}")
         # The same paths are mirrored to the logger.
         self.logger.info(f"Project root: {self.project_root}")
         self.logger.info(f"Models directory: {self.models_dir}")
         self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
         try:
             # Check directory existence
             # Fail fast with an explicit path in the message before any load is attempted.
+            print("• Checking directory existence...")
             if not self.embeddings_dir.exists():
                 raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
             if not self.indices_dir.exists():
                 raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
             # Load embeddings
+            print("• Loading embeddings...")
             self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
             self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
             # Load chunks
             # NOTE(review): no explicit encoding is passed to open(), so the
             # platform default applies — consider encoding="utf-8".
+            print("• Loading chunk metadata...")
             with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
                 self.emergency_chunks = json.load(f)
             with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
                 self.treatment_chunks = json.load(f)
             # Initialize model
             # Used by the cross-dataset search test to encode free-text queries.
+            print("• Loading PubMedBERT model...")
             self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
+            print(f"• Emergency embeddings shape: {self.emergency_emb.shape}")
+            print(f"• Treatment embeddings shape: {self.treatment_emb.shape}")
+            print("✅ Test environment initialized successfully")
             self.logger.info("Test environment initialized successfully")
             self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
             self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
         except FileNotFoundError as e:
             # Missing files get their own message, then propagate to the caller.
+            print(f"❌ File not found: {e}")
             self.logger.error(f"File not found: {e}")
             raise
         except Exception as e:
             # Anything else is reported and re-raised unchanged.
+            print(f"❌ Error during initialization: {e}")
             self.logger.error(f"Error during initialization: {e}")
             raise
     def test_embedding_dimensions(self):
         """Test embedding dimensions and data quality."""
+        print("\n=== Phase 3: Embedding Validation ===")
         self.logger.info("\n=== Embedding Validation Report ===")
         try:
             # Basic dimension checks
+            print("• Checking embedding dimensions...")
             assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
             assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
+            print(f"✓ Emergency dimensions: {self.emergency_emb.shape}")
+            print(f"✓ Treatment dimensions: {self.treatment_emb.shape}")
             # Count verification
+            print("• Verifying chunk count consistency...")
             assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
                 "Emergency chunks count mismatch"
             assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
                 "Treatment chunks count mismatch"
+            print(f"✓ Emergency: {len(self.emergency_chunks)} chunks = {self.emergency_emb.shape[0]} embeddings")
+            print(f"✓ Treatment: {len(self.treatment_chunks)} chunks = {self.treatment_emb.shape[0]} embeddings")
             # Data quality checks
+            print("• Performing data quality checks...")
             for name, emb in [("Emergency", self.emergency_emb),
                              ("Treatment", self.treatment_emb)]:
                 # Check for NaN and Inf
                 assert not np.isinf(emb).any(), f"{name} contains Inf values"
                 # Value distribution analysis
+                print(f"\n📊 {name} Embeddings Statistics:")
+                print(f"• Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
+                print(f"• Mean: {np.mean(emb):.3f}")
+                print(f"• Std: {np.std(emb):.3f}")
                 self.logger.info(f"\n{name} Embeddings Statistics:")
                 self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
                 self.logger.info(f"- Mean: {np.mean(emb):.3f}")
                 self.logger.info(f"- Std: {np.std(emb):.3f}")
+            print("\n✅ All embedding validations passed")
             self.logger.info("\n✅ All embedding validations passed")
         except AssertionError as e:
+            print(f"❌ Validation failed: {str(e)}")
             self.logger.error(f"Validation failed: {str(e)}")
             raise
     def test_multiple_known_item_search(self):
         """Test ANNOY search with multiple random samples.

         Draws up to 20 distinct rows from the emergency embeddings, queries
         the emergency index with each one, and expects the row to come back
         as its own nearest neighbour at (near) zero distance. At least 90%
         of the sampled rows must pass.

         Raises:
             AssertionError: If the embeddings are empty or fewer than 90% of
                 the samples retrieve themselves.
         """
         print("\n=== Phase 4: Multiple Known-Item Search Test ===")
         self.logger.info("\n=== Multiple Known-Item Search Test ===")

         print("• Loading emergency index...")
         emergency_index = AnnoyIndex(768, 'angular')
         emergency_index.load(str(self.indices_dir / "emergency_index.ann"))

         # Test 20 random samples, or every row if fewer are available.
         sample_size = min(20, self.emergency_emb.shape[0])
         assert sample_size > 0, "Emergency embeddings are empty"
         # ceil(0.9 * sample_size) in integer arithmetic; 18 for the usual 20.
         required = (9 * sample_size + 9) // 10

         print(f"• Selecting {sample_size} random samples for self-retrieval test...")
         # replace=False keeps the indices distinct so no row is tested twice.
         test_indices = np.random.choice(
             self.emergency_emb.shape[0],
             size=sample_size,
             replace=False,
         )

         success_count = 0
         print("• Testing self-retrieval for each sample...")
         for i, test_idx in enumerate(test_indices, 1):
             test_emb = self.emergency_emb[test_idx]
             indices, distances = self._safe_search(emergency_index, test_emb)

             if indices is None:
                 print(f"  {i}/{sample_size}: ❌ Search failed for index {test_idx}")
                 continue

             # Verify self-retrieval with explicit checks rather than assert,
             # so the per-sample bookkeeping still works under `python -O`.
             failure = None
             if indices[0] != test_idx:
                 failure = f"Self-retrieval failed for index {test_idx}"
             elif not distances[0] < 0.0001:
                 failure = f"Self-distance too large for index {test_idx}"

             if failure is None:
                 success_count += 1
                 print(f"  {i}/{sample_size}: ✓ Index {test_idx} (distance: {distances[0]:.6f})")
             else:
                 print(f"  {i}/{sample_size}: ❌ Index {test_idx} failed: {failure}")
                 self.logger.warning(f"Test failed for index {test_idx}: {failure}")

         pass_rate = success_count / sample_size * 100
         print(f"\n📊 Self-Retrieval Results: {success_count}/{sample_size} tests passed ({pass_rate:.1f}%)")
         self.logger.info(f"\n✅ {success_count}/{sample_size} self-retrieval tests passed")
         assert success_count >= required, "Less than 90% of self-retrieval tests passed"
         print("✅ Multiple known-item search test passed")
     def test_balanced_cross_dataset_search(self):
         """Test search across both emergency and treatment datasets.

         Loads both ANNOY indices, encodes each test query with the model
         created in ``setup_class``, retrieves the top-5 neighbours from each
         index and prints the first sentence of every hit. This is a smoke
         test: it fails only if something raises.

         Raises:
             Exception: Any error while loading the indices or searching is
                 printed, logged and re-raised.
         """
         print("\n=== Phase 5: Cross-Dataset Search Test ===")
         self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")

         # Initialize indices
         print("• Loading ANNOY indices...")
         emergency_index = AnnoyIndex(768, 'angular')
         treatment_index = AnnoyIndex(768, 'angular')

         try:
             emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
             treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
             print("✓ Emergency and treatment indices loaded")

             # Test queries
             test_queries = [
                 "What are the emergency procedures for anaphylactic shock?"
             ]
             # Derive the denominator from the list so the progress line
             # stays correct when queries are added or removed.
             total_queries = len(test_queries)
             print(f"• Testing {total_queries} medical queries...")

             for query_num, query in enumerate(test_queries, 1):
                 print(f"\n🔍 Query {query_num}/{total_queries}: {query}")

                 # Generate query vector
                 print("• Generating query embedding...")
                 query_emb = self.model.encode([query])[0]

                 # Get top-5 results from each dataset
                 print("• Searching both datasets...")
                 e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
                 t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)

                 # Identity test: `None in [...]` would compare with ==,
                 # which is ambiguous if a result is an array.
                 outcome = (e_indices, e_distances, t_indices, t_distances)
                 if any(part is None for part in outcome):
                     print("❌ Search failed for one or both datasets")
                     self.logger.error("Search failed for one or both datasets")
                     continue

                 # Print first sentence of each result
                 for title, tag, chunks, hit_ids, hit_dists in (
                     ("Emergency", "E", self.emergency_chunks, e_indices, e_distances),
                     ("Treatment", "T", self.treatment_chunks, t_indices, t_distances),
                 ):
                     print(f"\n📋 {title} Dataset Results:")
                     for rank, (idx, dist) in enumerate(zip(hit_ids, hit_dists), 1):
                         first_sentence = chunks[idx]['text'].split('.')[0] + '.'
                         print(f"  {tag}-{rank} (distance: {dist:.3f}): {first_sentence[:80]}...")
                 print("✓ Query completed")
         except Exception as e:
             print(f"❌ Test failed: {str(e)}")
             self.logger.error(f"Test failed: {str(e)}")
             raise
         else:
             print("\n✅ Cross-dataset search test completed")
             self.logger.info("\n✅ Cross-dataset search test completed")
def main():
    """Run all embedding validation tests.

    Sets up the test environment, then runs the dimension, self-retrieval
    and cross-dataset tests in order, printing a summary banner.

    Returns:
        int: Process exit status: 0 when every test passes, 1 when any
        test raises. Errors from ``setup_class`` are not caught here and
        propagate to the caller.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("COMPREHENSIVE EMBEDDING VALIDATION TEST SUITE")
    print(banner)

    test = TestEmbeddingValidation()
    test.setup_class()

    try:
        test.test_embedding_dimensions()
        test.test_multiple_known_item_search()
        test.test_balanced_cross_dataset_search()
    except Exception as e:
        print("\n" + banner)
        print("❌ EMBEDDING VALIDATION TESTS FAILED!")
        print(f"Error: {str(e)}")
        print(banner)
        # Report the failure to the shell/CI instead of ending with status 0.
        return 1

    print("\n" + banner)
    print("🎉 ALL EMBEDDING VALIDATION TESTS COMPLETED SUCCESSFULLY!")
    print(banner)
    print("✅ Embedding dimensions validated")
    print("✅ Self-retrieval accuracy confirmed")
    print("✅ Cross-dataset search functionality verified")
    print(banner)
    return 0


if __name__ == "__main__":
    sys.exit(main())