Merge pull request #2 from YanBoChen0928/embedding
### 🔧 Git History Cleanup: Removed Large Files + Forced Push
- .gitignore +29 -5
- dataset/analysis/integrity_check/integrity_check_report.json +0 -29
- dataset/analysis/keyword_matching_test_results.json +0 -151
- dataset/analysis/stats/analysis_stats_emergency_subset.json +0 -55
- dataset/analysis/stats/analysis_stats_emergency_subset_opt.json +0 -55
- dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json +0 -293
- dataset/scripts/data_explorer_treatment.py +1 -1
- requirements.txt +1 -0
- src/__init__.py +8 -0
- src/data_processing.py +698 -0
- tests/embedding_test_analysis.md +355 -0
- tests/test_data_processing.py +228 -0
- tests/test_embedding_and_index.py +29 -0
- tests/test_embedding_validation.py +213 -0
.gitignore
CHANGED
@@ -1,10 +1,34 @@
-
-
-#virtual environment
+# 🧠 Virtual environments
 genAIvenv/
 .final_project_env/
 
+# 💻 OS / Editor garbage
+.DS_Store
+.vscode/
+
+# 📁 Documentation and project folders
+docs/
+dataset/dataset/
+
+# 🧾 Compiled / output files
+*.pyc
+*.log
+*.zip
+*.tar.gz
+*.mp4
+*.mov
+*.json
+*.png
-
 
+# 🚫 Large files - models
+models/cache/
+models/cache/*.pkl
+models/embeddings/*.npy
+models/embeddings/*.json
+models/indices/
+models/indices/annoy/*.ann
-
 
+# 🚫 Redundant catch-all for large file extensions
+*.pkl
+*.npy
+*.ann
dataset/analysis/integrity_check/integrity_check_report.json
DELETED
@@ -1,29 +0,0 @@
-{
-  "sample_analysis": {
-    "matched": {
-      "non_null": 100,
-      "non_empty": 100,
-      "unique_values": 84
-    },
-    "treatment_matched": {
-      "non_null": 100,
-      "non_empty": 100,
-      "unique_values": 100
-    }
-  },
-  "full_file_analysis": {
-    "total_records": 9367,
-    "matched_column": {
-      "non_null_count": 9367,
-      "non_empty_count": 9367,
-      "null_percentage": 0.0
-    },
-    "treatment_matched_column": {
-      "non_null_count": 9367,
-      "non_empty_count": 9367,
-      "null_percentage": 0.0
-    },
-    "both_matched_count": 3315,
-    "both_matched_percentage": 35.39019963702359
-  }
-}
dataset/analysis/keyword_matching_test_results.json
DELETED
@@ -1,151 +0,0 @@
-{
-  "special_terms_matching": [
-    {
-      "clean_text": "Patient needs an x-ray of the chest",
-      "category": "x-ray variants",
-      "matched": "x-ray"
-    },
-    {
-      "clean_text": "Ordered chest xray",
-      "category": "x-ray variants",
-      "matched": "xray"
-    },
-    {
-      "clean_text": "X ray shows pneumonia",
-      "category": "x-ray variants",
-      "matched": "X ray"
-    },
-    {
-      "clean_text": "XRAY negative",
-      "category": "x-ray variants",
-      "matched": "XRAY"
-    },
-    {
-      "clean_text": "CT scan reveals nodule",
-      "category": "ct-scan variants",
-      "matched": "CT scan"
-    },
-    {
-      "clean_text": "CT-scan indicates mass",
-      "category": "ct-scan variants",
-      "matched": "CT-scan"
-    },
-    {
-      "clean_text": "Requires ctscan urgently",
-      "category": "ct-scan variants",
-      "matched": "ctscan"
-    },
-    {
-      "clean_text": "CTSCAN of abdomen",
-      "category": "ct-scan variants",
-      "matched": "CTSCAN"
-    },
-    {
-      "clean_text": "Point-of-care testing needed",
-      "category": "point-of-care variants",
-      "matched": "Point-of-care"
-    },
-    {
-      "clean_text": "Point of care ultrasound",
-      "category": "point-of-care variants",
-      "matched": "Point of care"
-    },
-    {
-      "clean_text": "POC testing results",
-      "category": "point-of-care variants",
-      "matched": ""
-    },
-    {
-      "clean_text": "Ordered both x-ray and CT scan",
-      "category": "mixed cases",
-      "matched": "x-ray|CT scan"
-    },
-    {
-      "clean_text": "XRAY and CTSCAN negative",
-      "category": "mixed cases",
-      "matched": "XRAY|CTSCAN"
-    },
-    {
-      "clean_text": "Multiple point-of-care tests with x-ray",
-      "category": "mixed cases",
-      "matched": "point-of-care|x-ray"
-    },
-    {
-      "clean_text": "No imaging mentioned",
-      "category": "negative cases",
-      "matched": ""
-    },
-    {
-      "clean_text": "Regular examination only",
-      "category": "negative cases",
-      "matched": ""
-    },
-    {
-      "clean_text": "Laboratory tests pending",
-      "category": "negative cases",
-      "matched": ""
-    }
-  ],
-  "basic_matching": [
-    {
-      "clean_text": "Emergency treatment required",
-      "category": "simple matches",
-      "matched": "Emergency"
-    },
-    {
-      "clean_text": "Acute condition observed",
-      "category": "simple matches",
-      "matched": "Acute"
-    },
-    {
-      "clean_text": "Urgent care needed",
-      "category": "simple matches",
-      "matched": "Urgent"
-    },
-    {
-      "clean_text": "EMERGENCY situation",
-      "category": "case variations",
-      "matched": "EMERGENCY"
-    },
-    {
-      "clean_text": "Acute RESPIRATORY failure",
-      "category": "case variations",
-      "matched": "Acute"
-    },
-    {
-      "clean_text": "URgent surgical intervention",
-      "category": "case variations",
-      "matched": "URgent"
-    },
-    {
-      "clean_text": "Emergency treatment for acute condition",
-      "category": "multiple matches",
-      "matched": "Emergency|acute"
-    },
-    {
-      "clean_text": "Urgent care in emergency department",
-      "category": "multiple matches",
-      "matched": "Urgent|emergency"
-    },
-    {
-      "clean_text": "Acute respiratory emergency",
-      "category": "multiple matches",
-      "matched": "Acute|emergency"
-    },
-    {
-      "clean_text": "Non-emergency situation",
-      "category": "partial words",
-      "matched": "emergency"
-    },
-    {
-      "clean_text": "Subacute condition",
-      "category": "partial words",
-      "matched": ""
-    },
-    {
-      "clean_text": "Emergency-related",
-      "category": "partial words",
-      "matched": "Emergency"
-    }
-  ]
-}
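The deleted fixture above documents the intended matching behaviour: hyphen and case variants of terms like "x-ray" match, "Non-emergency" still matches "emergency", but "Subacute" does not match "acute". A minimal sketch of word-boundary keyword matching that reproduces those cases is shown below; the helper name and keyword lists are illustrative and not part of the repository.

```python
import re

def match_keywords(text: str, keywords: list[str]) -> str:
    """Return pipe-separated keywords found in text, using word-boundary matching."""
    hits = []
    for kw in keywords:
        # \b before the keyword prevents "Subacute" from matching "acute",
        # while "Non-emergency" still matches because "-" is a word boundary.
        pattern = r"\b" + re.escape(kw) + r"\b"
        m = re.search(pattern, text, flags=re.IGNORECASE)
        if m:
            hits.append(m.group(0))  # keep the casing found in the text, as in the fixture
    return "|".join(hits)

print(match_keywords("Ordered both x-ray and CT scan", ["x-ray", "CT scan"]))  # x-ray|CT scan
print(match_keywords("Subacute condition", ["acute"]))                         # (empty string)
print(match_keywords("Non-emergency situation", ["emergency"]))                # emergency
```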
dataset/analysis/stats/analysis_stats_emergency_subset.json
DELETED
@@ -1,55 +0,0 @@
-{
-  "basic_statistics": {
-    "total_records": 10282,
-    "avg_length": 25185.078194903715
-  },
-  "keyword_statistics": {
-    "Acute abdomen": 52,
-    "Acute bleeding": 31,
-    "Acute Coronary Syndrome": 345,
-    "Acute Kidney Injury": 202,
-    "Acute pancreatitis": 214,
-    "Acute respiratory distress syndrome": 231,
-    "Acute stroke": 67,
-    "Anaphylaxis": 1016,
-    "Anaphylactic Shock": 153,
-    "Arrhythmia": 1547,
-    "Atrial fibrillation": 771,
-    "Atrial flutter": 146,
-    "Bradycardia": 884,
-    "Cardiac arrest": 614,
-    "Cardiogenic Shock": 196,
-    "Chest pain": 1433,
-    "Dyspnea": 1319,
-    "Fever": 4270,
-    "Gastrointestinal Hemorrhage": 158,
-    "GI bleeding": 105,
-    "Hemorrhage": 1611,
-    "Hemorrhagic stroke": 117,
-    "Hyperthermia": 305,
-    "Hypovolemic Shock": 63,
-    "Hypotension": 1929,
-    "Hypothermia": 356,
-    "Internal bleeding": 70,
-    "Intracranial Hemorrhages": 6,
-    "Ischemic stroke": 224,
-    "Loss of consciousness": 422,
-    "Myocardial Infarction": 1708,
-    "MI": 10183,
-    "Pulmonary Edema": 487,
-    "Pulmonary Embolism": 654,
-    "Respiratory distress": 730,
-    "Respiratory failure": 579,
-    "Sepsis": 1181,
-    "Severe Sepsis": 81,
-    "Septic Shock": 244,
-    "Shock": 1881,
-    "Status Epilepticus": 150,
-    "Syncope": 834,
-    "Tachycardia": 1650,
-    "Tachypnea": 268,
-    "Traumatic Brain Injury": 171,
-    "Ventricular Tachycardia": 491,
-    "Ventricular fibrillation": 295
-  }
-}
dataset/analysis/stats/analysis_stats_emergency_subset_opt.json
DELETED
@@ -1,55 +0,0 @@
-{
-  "basic_statistics": {
-    "total_records": 11914,
-    "avg_length": 23847.07579318449
-  },
-  "keyword_statistics": {
-    "Acute abdomen": 52,
-    "Acute bleeding": 31,
-    "Acute Coronary Syndrome": 351,
-    "Acute Kidney Injury": 202,
-    "Acute pancreatitis": 214,
-    "Acute respiratory distress syndrome": 231,
-    "Acute stroke": 67,
-    "Anaphylaxis": 1016,
-    "Anaphylactic Shock": 153,
-    "Arrhythmia": 1564,
-    "Atrial fibrillation": 771,
-    "Atrial flutter": 146,
-    "Bradycardia": 884,
-    "Cardiac arrest": 614,
-    "Cardiogenic Shock": 196,
-    "Chest pain": 1434,
-    "Dyspnea": 1319,
-    "Fever": 4279,
-    "Gastrointestinal Hemorrhage": 158,
-    "GI bleeding": 105,
-    "Hemorrhage": 1621,
-    "Hemorrhagic stroke": 117,
-    "Hyperthermia": 305,
-    "Hypovolemic Shock": 63,
-    "Hypotension": 1929,
-    "Hypothermia": 356,
-    "Internal bleeding": 70,
-    "Intracranial Hemorrhages": 6,
-    "Ischemic stroke": 225,
-    "Loss of consciousness": 422,
-    "Myocardial Infarction": 1710,
-    "MI": 11773,
-    "Pulmonary Edema": 487,
-    "Pulmonary Embolism": 654,
-    "Respiratory distress": 730,
-    "Respiratory failure": 579,
-    "Sepsis": 1188,
-    "Severe Sepsis": 81,
-    "Septic Shock": 244,
-    "Shock": 1892,
-    "Status Epilepticus": 150,
-    "Syncope": 834,
-    "Tachycardia": 1651,
-    "Tachypnea": 268,
-    "Traumatic Brain Injury": 171,
-    "Ventricular Tachycardia": 492,
-    "Ventricular fibrillation": 295
-  }
-}
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json
DELETED
@@ -1,293 +0,0 @@
-{
-  "basic_statistics": {
-    "total_records": 9367,
-    "avg_text_length": 27179.22952919825,
-    "emergency_keywords_count": 47,
-    "treatment_keywords_count": 105
-  },
-  "emergency_keyword_stats": {
-    "Acute abdomen": 51,
-    "Acute bleeding": 31,
-    "Acute Coronary Syndrome": 332,
-    "Acute Kidney Injury": 200,
-    "Acute pancreatitis": 202,
-    "Acute respiratory distress syndrome": 225,
-    "Acute stroke": 65,
-    "Anaphylaxis": 1002,
-    "Anaphylactic Shock": 148,
-    "Arrhythmia": 1490,
-    "Atrial fibrillation": 736,
-    "Atrial flutter": 139,
-    "Bradycardia": 845,
-    "Cardiac arrest": 600,
-    "Cardiogenic Shock": 192,
-    "Chest pain": 1408,
-    "Dyspnea": 1296,
-    "Fever": 4008,
-    "Gastrointestinal Hemorrhage": 158,
-    "GI bleeding": 103,
-    "Hemorrhage": 1532,
-    "Hemorrhagic stroke": 109,
-    "Hyperthermia": 283,
-    "Hypovolemic Shock": 61,
-    "Hypotension": 1897,
-    "Hypothermia": 340,
-    "Internal bleeding": 67,
-    "Intracranial Hemorrhages": 5,
-    "Ischemic stroke": 216,
-    "Loss of consciousness": 406,
-    "Myocardial Infarction": 1607,
-    "MI": 9316,
-    "Pulmonary Edema": 471,
-    "Pulmonary Embolism": 624,
-    "Respiratory distress": 713,
-    "Respiratory failure": 554,
-    "Sepsis": 1145,
-    "Severe Sepsis": 81,
-    "Septic Shock": 231,
-    "Shock": 1702,
-    "Status Epilepticus": 149,
-    "Syncope": 806,
-    "Tachycardia": 1576,
-    "Tachypnea": 262,
-    "Traumatic Brain Injury": 151,
-    "Ventricular Tachycardia": 461,
-    "Ventricular fibrillation": 280
-  },
-  "treatment_keyword_stats": {
-    "ACLS": 30,
-    "administer": 3881,
-    "Adrenaline": 135,
-    "Advanced Cardiac Life Support": 34,
-    "Airway Management": 174,
-    "alpha blocker": 35,
-    "Amiodarone": 315,
-    "analgesia": 323,
-    "Anesthesia Procedural": 0,
-    "Anti-Bacterial Agents": 1,
-    "antibiotic": 1922,
-    "arterial line placement": 0,
-    "beta blocker": 297,
-    "Bi-level Positive Airway Pressure": 6,
-    "bipap": 25,
-    "Blood Transfusion": 379,
-    "Bosmin": 0,
-    "Cardiopulmonary Resuscitation": 131,
-    "Cardioversion": 142,
-    "Catheterization Arterial": 0,
-    "Catheterization Central Venous": 0,
-    "central line placement": 6,
-    "compression dressing": 2,
-    "Computed Tomography": 518,
-    "cpap": 84,
-    "cpr": 151,
-    "crystalloids": 45,
-    "ct scan": 1036,
-    "Defibrillation": 96,
-    "Dopamine": 389,
-    "Dosage Forms": 210,
-    "dose": 5344,
-    "Drug Administration Routes": 0,
-    "Drug Therapy": 773,
-    "Epinephrine": 806,
-    "fluid": 2938,
-    "fluid resuscitation": 115,
-    "hemodynamic monitoring": 43,
-    "Hemodynamics": 135,
-    "Hemostasis": 180,
-    "Ibuprofen": 269,
-    "icu transfer": 9,
-    "Insulin": 808,
-    "intervention": 2695,
-    "intubation": 493,
-    "Intratracheal Intubation": 3,
-    "Intravenous Infusion": 576,
-    "iv fluids": 75,
-    "laboratory techniques": 29,
-    "laboratory testing": 296,
-    "levophed": 11,
-    "Lidocaine": 212,
-    "manage": 4416,
-    "management": 4008,
-    "medication": 4698,
-    "midazolam": 204,
-    "monitor": 4521,
-    "monitoring": 3593,
-    "Morphine": 289,
-    "Nebulization": 41,
-    "nitroglycerin": 125,
-    "NTG": 81,
-    "Norepinephrine": 392,
-    "normal saline": 252,
-    "Ondansetron": 43,
-    "Oxygen": 1779,
-    "Oxygen Inhalation Therapy": 2,
-    "oxygen therapy": 178,
-    "Patient Management": 281,
-    "Patient Monitoring": 107,
-    "POCUS": 10,
-    "point of care ultrasound": 2,
-    "procedural sedation": 26,
-    "procedure": 3073,
-    "radiologic imaging": 5,
-    "Radiography": 218,
-    "resuscitation": 539,
-    "Sedation": 602,
-    "splinting": 26,
-    "Splints": 29,
-    "supportive care": 564,
-    "surgical procedures": 482,
-    "Surgical Procedures Operative": 0,
-    "surgery": 3531,
-    "Suture": 179,
-    "Suturing": 53,
-    "Therapeutic Intervention": 181,
-    "Therapeutics": 182,
-    "Therapy": 6117,
-    "tourniquet": 56,
-    "transfusion": 826,
-    "treat": 8270,
-    "treatment": 7719,
-    "Ultrasonography Point of Care": 0,
-    "ultrasound": 1273,
-    "Vasoconstrictor Agents": 2,
-    "vasopressors": 188,
-    "ventilation support": 14,
-    "Ventilators": 86,
-    "Vital Signs": 459,
-    "vital signs monitoring": 1,
-    "wound care": 73,
-    "Wound Dressing": 30,
-    "Wound Management": 37,
-    "X-Ray": 1293
-  },
-  "cooccurrence_analysis": [
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 3488,
-      "percentage": 37.23710899967973
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 2698,
-      "percentage": 28.803245436105477
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "dose",
-      "cooccurrence_count": 2430,
-      "percentage": 25.94213729048788
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "medication",
-      "cooccurrence_count": 1979,
-      "percentage": 21.127362015586634
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1760,
-      "percentage": 18.789366926443897
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "management",
-      "cooccurrence_count": 1753,
-      "percentage": 18.714636489804633
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "treat",
-      "cooccurrence_count": 1744,
-      "percentage": 18.618554499839863
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "monitoring",
-      "cooccurrence_count": 1674,
-      "percentage": 17.87125013344721
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 1558,
-      "percentage": 16.63286004056795
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "surgery",
-      "cooccurrence_count": 1505,
-      "percentage": 16.06704387744208
-    },
-    {
-      "emergency_keyword": "Tachycardia",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1441,
-      "percentage": 15.383794171025942
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "dose",
-      "cooccurrence_count": 1423,
-      "percentage": 15.191630191096403
-    },
-    {
-      "emergency_keyword": "Myocardial Infarction",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1369,
-      "percentage": 14.615138251307783
-    },
-    {
-      "emergency_keyword": "Shock",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1340,
-      "percentage": 14.305540728087967
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "fluid",
-      "cooccurrence_count": 1330,
-      "percentage": 14.198782961460447
-    },
-    {
-      "emergency_keyword": "Hemorrhage",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1328,
-      "percentage": 14.177431408134941
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "monitoring",
-      "cooccurrence_count": 1325,
-      "percentage": 14.145404078146683
-    },
-    {
-      "emergency_keyword": "Tachycardia",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 1277,
-      "percentage": 13.632966798334579
-    },
-    {
-      "emergency_keyword": "Dyspnea",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1228,
-      "percentage": 13.10985374185972
-    },
-    {
-      "emergency_keyword": "Myocardial Infarction",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 1215,
-      "percentage": 12.97106864524394
-    }
-  ],
-  "path_b_validation": {
-    "avg_emergency_density": 0.3098621434407273,
-    "avg_treatment_density": 0.6108515041451529,
-    "high_density_records": 1298,
-    "precision_estimate": 0.9995729689334899
-  },
-  "condition_mapping_candidates": {}
-}
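For context, each cooccurrence_analysis entry removed above pairs an emergency keyword with a treatment keyword and reports the share of records mentioning both (for example 3488 of 9367 records ≈ 37.24% for Fever + treatment). A rough sketch of how such a count could be derived from the filtered dataframe is shown below; the column names follow the loader in src/data_processing.py, while the function itself is illustrative and the actual analysis script is not part of this diff.

```python
import pandas as pd

def cooccurrence(df: pd.DataFrame, emergency_kw: str, treatment_kw: str) -> dict:
    """Count records whose clean_text mentions both keywords (case-insensitive substring check)."""
    text = df["clean_text"].fillna("").str.lower()
    both = (text.str.contains(emergency_kw.lower(), regex=False)
            & text.str.contains(treatment_kw.lower(), regex=False))
    count = int(both.sum())
    return {
        "emergency_keyword": emergency_kw,
        "treatment_keyword": treatment_kw,
        "cooccurrence_count": count,
        "percentage": 100.0 * count / len(df),  # e.g. 100 * 3488 / 9367 ≈ 37.24
    }
```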
dataset/scripts/data_explorer_treatment.py
CHANGED
@@ -2,7 +2,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-import numpy
+# Removed duplicate import of numpy
 from pathlib import Path
 import json
 from tqdm import tqdm
requirements.txt
CHANGED
@@ -64,6 +64,7 @@ safehttpx==0.1.6
 safetensors==0.5.3
 seaborn==0.13.2
 semantic-version==2.10.0
+sentence-transformers==3.0.1
 shellingham==1.5.4
 six==1.17.0
 sniffio==1.3.1
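sentence-transformers==3.0.1 is the only new dependency in this PR; it provides the encoder used by src/data_processing.py. A quick sanity check that the pin works with the PubMedBERT embedding model might look like the sketch below (the sample sentence is arbitrary, and the first run downloads the model from Hugging Face):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
vec = model.encode(["Acute chest pain with dyspnea"])
print(vec.shape)  # expected (1, 768), matching embedding_dim in DataProcessor
```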
src/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+OnCall.ai src package
+
+This package contains the core implementation of the OnCall.ai system.
+"""
+
+# Version
+__version__ = '0.1.0'
src/data_processing.py
ADDED
@@ -0,0 +1,698 @@
+"""
+OnCall.ai Data Processing Module
+
+This module handles:
+1. Loading filtered medical guideline data
+2. Creating intelligent chunks based on matched keywords
+3. Generating embeddings using NeuML/pubmedbert-base-embeddings
+4. Building ANNOY indices for vector search
+5. Data quality validation
+
+Author: OnCall.ai Team
+Date: 2025-07-26
+"""
+
+# Required imports for core functionality
+import json
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import List, Dict, Tuple, Any
+from sentence_transformers import SentenceTransformer
+from annoy import AnnoyIndex
+import logging
+from tqdm import tqdm
+
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO,  # change between INFO and DEBUG level
+    format='%(levelname)s:%(name)s:%(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Explicitly define what should be exported
+__all__ = ['DataProcessor']
+
+class DataProcessor:
+    """Main data processing class for OnCall.ai RAG system"""
+
+    def __init__(self, base_dir: str = None):
+        """
+        Initialize DataProcessor
+
+        Args:
+            base_dir: Base directory path for the project
+        """
+        self.base_dir = Path(base_dir).resolve() if base_dir else Path(__file__).parent.parent.resolve()
+        self.dataset_dir = (self.base_dir / "dataset" / "dataset").resolve()  # modify to actual dataset directory
+        self.models_dir = (self.base_dir / "models").resolve()
+
+        # Model configuration
+        self.embedding_model_name = "NeuML/pubmedbert-base-embeddings"
+        self.embedding_dim = 768  # PubMedBERT dimension
+        self.chunk_size = 256  # Changed to tokens instead of characters
+        self.chunk_overlap = 64  # Added overlap configuration
+
+        # Initialize model and tokenizer (will be loaded when needed)
+        self.embedding_model = None
+        self.tokenizer = None
+
+        # Data containers
+        self.emergency_data = None
+        self.treatment_data = None
+        self.emergency_chunks = []
+        self.treatment_chunks = []
+
+        # Initialize indices
+        self.emergency_index = None
+        self.treatment_index = None
+
+        logger.info(f"Initialized DataProcessor with:")
+        logger.info(f"  Base directory: {self.base_dir}")
+        logger.info(f"  Dataset directory: {self.dataset_dir}")
+        logger.info(f"  Models directory: {self.models_dir}")
+        logger.info(f"  Chunk size (tokens): {self.chunk_size}")
+        logger.info(f"  Chunk overlap (tokens): {self.chunk_overlap}")
+
+    def load_embedding_model(self):
+        """Load the embedding model and initialize tokenizer"""
+        if self.embedding_model is None:
+            logger.info(f"Loading embedding model: {self.embedding_model_name}")
+            self.embedding_model = SentenceTransformer(self.embedding_model_name)
+            self.tokenizer = self.embedding_model.tokenizer
+            logger.info("Embedding model and tokenizer loaded successfully")
+        return self.embedding_model
+
+    def load_filtered_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Load pre-filtered emergency and treatment data
+
+        Returns:
+            Tuple of (emergency_data, treatment_data) DataFrames
+        """
+        logger.info("Loading filtered medical data...")
+
+        # File paths
+        emergency_path = (self.dataset_dir / "emergency" / "emergency_subset_opt.jsonl").resolve()
+        treatment_path = (self.dataset_dir / "emergency_treatment" / "emergency_treatment_subset_opt.jsonl").resolve()
+
+        logger.info(f"Looking for emergency data at: {emergency_path}")
+        logger.info(f"Looking for treatment data at: {treatment_path}")
+
+        # Validate file existence
+        if not emergency_path.exists():
+            raise FileNotFoundError(f"Emergency data not found: {emergency_path}")
+        if not treatment_path.exists():
+            raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
+
+        # Load data
+        self.emergency_data = pd.read_json(str(emergency_path), lines=True)  # use str() to ensure the path is handled correctly
+        self.treatment_data = pd.read_json(str(treatment_path), lines=True)
+
+        logger.info(f"Loaded {len(self.emergency_data)} emergency records")
+        logger.info(f"Loaded {len(self.treatment_data)} treatment records")
+
+        return self.emergency_data, self.treatment_data
+
+    def create_keyword_centered_chunks(self, text: str, matched_keywords: str,
+                                       chunk_size: int = None, doc_id: str = None) -> List[Dict[str, Any]]:
+        """
+        Create chunks centered around matched keywords using tokenizer
+
+        Args:
+            text: Input text
+            matched_keywords: Pipe-separated keywords (e.g., "MI|chest pain|fever")
+            chunk_size: Size of each chunk in tokens (defaults to self.chunk_size)
+            doc_id: Document ID for tracking
+
+        Returns:
+            List of chunk dictionaries
+        """
+        if not matched_keywords or pd.isna(matched_keywords):
+            return []
+
+        # Load model if not loaded (to get tokenizer)
+        if self.tokenizer is None:
+            self.load_embedding_model()
+
+        # Convert text and keywords to lowercase at the start
+        text = text.lower()
+        keywords = [kw.lower() for kw in matched_keywords.split("|")] if matched_keywords else []
+
+        chunk_size = chunk_size or self.chunk_size
+        chunks = []
+
+        # Calculate character-to-token ratio using a sample around the first keyword
+        if keywords:
+            first_keyword = keywords[0]
+            first_pos = text.find(first_keyword)
+            if first_pos != -1:
+                # Take a sample around the first keyword for ratio calculation
+                sample_start = max(0, first_pos - 100)
+                sample_end = min(len(text), first_pos + len(first_keyword) + 100)
+                sample_text = text[sample_start:sample_end]
+                sample_tokens = len(self.tokenizer.tokenize(sample_text))
+                chars_per_token = len(sample_text) / sample_tokens if sample_tokens > 0 else 4.0
+            else:
+                chars_per_token = 4.0  # Fallback ratio
+        else:
+            chars_per_token = 4.0  # Default ratio
+
+        # Process keywords
+        for i, keyword in enumerate(keywords):
+            # Find keyword position in text (already lowercase)
+            keyword_pos = text.find(keyword)
+
+            if keyword_pos != -1:
+                # Get the keyword text (already lowercase)
+                actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
+
+                # Calculate rough window size using dynamic ratio
+                # Cap the rough chunk target token size to prevent tokenizer warnings
+                # Use 512 tokens as target (model's max limit)
+                ROUGH_CHUNK_TARGET_TOKENS = 512
+                char_window = int(ROUGH_CHUNK_TARGET_TOKENS * chars_per_token / 2)
+
+                # Get rough chunk boundaries in characters
+                rough_start = max(0, keyword_pos - char_window)
+                rough_end = min(len(text), keyword_pos + len(keyword) + char_window)
+
+                # Extract rough chunk for processing
+                rough_chunk = text[rough_start:rough_end]
+
+                # Find keyword's relative position in rough chunk
+                rel_pos = rough_chunk.find(actual_keyword)
+                if rel_pos == -1:
+                    logger.debug(f"Could not locate keyword '{actual_keyword}' in rough chunk for doc {doc_id}")
+                    continue
+
+                # Calculate token position by tokenizing text before keyword
+                text_before = rough_chunk[:rel_pos]
+                tokens_before = self.tokenizer.tokenize(text_before)
+                keyword_start_pos = len(tokens_before)
+
+                # Tokenize necessary parts
+                chunk_tokens = self.tokenizer.tokenize(rough_chunk)
+                keyword_tokens = self.tokenizer.tokenize(actual_keyword)
+                keyword_length = len(keyword_tokens)
+
+                # Calculate final chunk boundaries in tokens
+                tokens_each_side = (chunk_size - keyword_length) // 2
+                chunk_start = max(0, keyword_start_pos - tokens_each_side)
+                chunk_end = min(len(chunk_tokens), keyword_start_pos + keyword_length + tokens_each_side)
+
+                # Add overlap if possible
+                if chunk_start > 0:
+                    chunk_start = max(0, chunk_start - self.chunk_overlap)
+                if chunk_end < len(chunk_tokens):
+                    chunk_end = min(len(chunk_tokens), chunk_end + self.chunk_overlap)
+
+                # Extract final tokens and convert to text
+                final_tokens = chunk_tokens[chunk_start:chunk_end]
+                chunk_text = self.tokenizer.convert_tokens_to_string(final_tokens)
+
+                # Verify keyword presence in final chunk
+                if chunk_text and actual_keyword in chunk_text:
+                    chunk_info = {
+                        "text": chunk_text,
+                        "primary_keyword": actual_keyword,
+                        "all_matched_keywords": matched_keywords.lower(),
+                        "token_count": len(final_tokens),
+                        "chunk_id": f"{doc_id}_chunk_{i}" if doc_id else f"chunk_{i}",
+                        "source_doc_id": doc_id
+                    }
+                    chunks.append(chunk_info)
+                else:
+                    logger.debug(f"Could not create chunk for keyword '{actual_keyword}' in doc {doc_id}")
+
+        if chunks:
+            logger.debug(f"Created {len(chunks)} chunks for document {doc_id or 'unknown'}")
+
+        return chunks
+
+    def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
+                                   treatment_keywords: str, chunk_size: int = 512,
+                                   doc_id: str = None) -> List[Dict[str, Any]]:
+        """
+        Create chunks for treatment data with both emergency and treatment keywords
+
+        Args:
+            text: Input text
+            emergency_keywords: Emergency keywords
+            treatment_keywords: Treatment keywords
+            chunk_size: Size of each chunk
+            doc_id: Document ID for tracking
+
+        Returns:
+            List of chunk dictionaries
+        """
+        if not treatment_keywords or pd.isna(treatment_keywords):
+            return []
+
+        chunks = []
+        em_keywords = emergency_keywords.split("|") if emergency_keywords else []
+        tr_keywords = treatment_keywords.split("|") if treatment_keywords else []
+
+        # Process treatment keywords as primary (since this is treatment-focused data)
+        for i, tr_keyword in enumerate(tr_keywords):
+            tr_pos = text.lower().find(tr_keyword.lower())
+
+            if tr_pos != -1:
+                # Find closest emergency keyword for context
+                closest_em_keyword = None
+                closest_distance = float('inf')
+
+                for em_keyword in em_keywords:
+                    em_pos = text.lower().find(em_keyword.lower())
+                    if em_pos != -1:
+                        distance = abs(tr_pos - em_pos)
+                        if distance < closest_distance and distance < chunk_size:
+                            closest_distance = distance
+                            closest_em_keyword = em_keyword
+
+                # Calculate chunk boundaries
+                if closest_em_keyword:
+                    # Center between both keywords
+                    em_pos = text.lower().find(closest_em_keyword.lower())
+                    center = (tr_pos + em_pos) // 2
+                else:
+                    # Center on treatment keyword
+                    center = tr_pos
+
+                start = max(0, center - chunk_size // 2)
+                end = min(len(text), center + chunk_size // 2)
+
+                chunk_text = text[start:end].strip()
+
+                if chunk_text:
+                    chunk_info = {
+                        "text": chunk_text,
+                        "primary_keyword": tr_keyword,
+                        "emergency_keywords": emergency_keywords,
+                        "treatment_keywords": treatment_keywords,
+                        "closest_emergency_keyword": closest_em_keyword,
+                        "keyword_distance": closest_distance if closest_em_keyword else None,
+                        "chunk_start": start,
+                        "chunk_end": end,
+                        "chunk_id": f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}",
+                        "source_doc_id": doc_id
+                    }
+                    chunks.append(chunk_info)
+
+        return chunks
+
+    def process_emergency_chunks(self) -> List[Dict[str, Any]]:
+        """Process emergency data into chunks"""
+        if self.emergency_data is None:
+            raise ValueError("Emergency data not loaded. Call load_filtered_data() first.")
+
+        all_chunks = []
+
+        # Add progress bar with leave=False to avoid cluttering
+        for idx, row in tqdm(self.emergency_data.iterrows(),
+                             total=len(self.emergency_data),
+                             desc="Processing emergency documents",
+                             unit="doc",
+                             leave=False):
+            if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
+                chunks = self.create_keyword_centered_chunks(
+                    text=row['clean_text'],
+                    matched_keywords=row['matched'],
+                    chunk_size=self.chunk_size,
+                    doc_id=str(row.get('id', idx))
+                )
+
+                # Add metadata to each chunk
+                for chunk in chunks:
+                    chunk.update({
+                        'source_type': 'emergency',
+                        'source_title': row.get('title', ''),
+                        'source_url': row.get('url', ''),
+                        'has_emergency': row.get('has_emergency', True),
+                        'doc_type': row.get('type', 'emergency')
+                    })
+
+                all_chunks.extend(chunks)
+
+        self.emergency_chunks = all_chunks
+        logger.info(f"Completed processing emergency data: {len(all_chunks)} chunks generated")
+        return all_chunks
+
+    def process_treatment_chunks(self) -> List[Dict[str, Any]]:
+        """Process treatment data into chunks"""
+        if self.treatment_data is None:
+            raise ValueError("Treatment data not loaded. Call load_filtered_data() first.")
+
+        all_chunks = []
+
+        # Add progress bar with leave=False to avoid cluttering
+        for idx, row in tqdm(self.treatment_data.iterrows(),
+                             total=len(self.treatment_data),
+                             desc="Processing treatment documents",
+                             unit="doc",
+                             leave=False):
+            if (pd.notna(row.get('clean_text')) and
+                pd.notna(row.get('treatment_matched'))):
+
+                chunks = self.create_dual_keyword_chunks(
+                    text=row['clean_text'],
+                    emergency_keywords=row.get('matched', ''),
+                    treatment_keywords=row['treatment_matched'],
+                    chunk_size=self.chunk_size,
+                    doc_id=str(row.get('id', idx))
+                )
+
+                # Add metadata to each chunk
+                for chunk in chunks:
+                    chunk.update({
+                        'source_type': 'treatment',
+                        'source_title': row.get('title', ''),
+                        'source_url': row.get('url', ''),
+                        'has_emergency': row.get('has_emergency', True),
+                        'has_treatment': row.get('has_treatment', True),
+                        'doc_type': row.get('type', 'treatment')
+                    })
+
+                all_chunks.extend(chunks)
+
+        self.treatment_chunks = all_chunks
+        logger.info(f"Completed processing treatment data: {len(all_chunks)} chunks generated")
+        return all_chunks
+
+    def _get_chunk_hash(self, text: str) -> str:
+        """Generate hash for chunk text to use as cache key"""
+        import hashlib
+        return hashlib.md5(text.encode('utf-8')).hexdigest()
+
+    def _load_embedding_cache(self, cache_file: str) -> dict:
+        """Load embedding cache from file"""
+        import pickle
+        import os
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'rb') as f:
+                    return pickle.load(f)
+            except:
+                logger.warning(f"Could not load cache file {cache_file}, starting fresh")
+                return {}
+        return {}
+
+    def _save_embedding_cache(self, cache: dict, cache_file: str):
+        """Save embedding cache to file"""
+        import pickle
+        import os
+        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
+        with open(cache_file, 'wb') as f:
+            pickle.dump(cache, f)
+
+    def generate_embeddings(self, chunks: List[Dict[str, Any]],
+                            chunk_type: str = "emergency") -> np.ndarray:
+        """
+        Generate embeddings for chunks with caching support
+
+        Args:
+            chunks: List of chunk dictionaries
+            chunk_type: Type of chunks ("emergency" or "treatment")
+
+        Returns:
+            numpy array of embeddings
+        """
+        logger.info(f"Starting embedding generation for {len(chunks)} {chunk_type} chunks...")
+
+        # Cache setup
+        cache_dir = self.models_dir / "cache"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_file = cache_dir / f"{chunk_type}_embeddings_cache.pkl"
+
+        # Load existing cache
+        cache = self._load_embedding_cache(str(cache_file))
+
+        cached_embeddings = []
+        to_embed = []
+
+        # Check cache for each chunk
+        for i, chunk in enumerate(chunks):
+            chunk_hash = self._get_chunk_hash(chunk['text'])
+            if chunk_hash in cache:
+                cached_embeddings.append((i, cache[chunk_hash]))
+            else:
+                to_embed.append((i, chunk_hash, chunk['text']))
+
+        logger.info(f"Cache status: {len(cached_embeddings)} cached, {len(to_embed)} new chunks to embed")
+
+        # Generate embeddings for new chunks
+        new_embeddings = []
+        if to_embed:
+            # Load model
+            model = self.load_embedding_model()
+            texts = [text for _, _, text in to_embed]
+
+            # Generate embeddings in batches with clear progress
+            batch_size = 32
+            total_batches = (len(texts) + batch_size - 1) // batch_size
+
+            logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
+
+            for i in tqdm(range(0, len(texts), batch_size),
+                          desc=f"Embedding {chunk_type} subset",
+                          total=total_batches,
+                          unit="batch",
+                          leave=False):
+                batch_texts = texts[i:i + batch_size]
+                batch_emb = model.encode(
+                    batch_texts,
+                    show_progress_bar=False
+                )
+                new_embeddings.extend(batch_emb)
+
+            # Update cache with new embeddings
+            for (_, chunk_hash, _), emb in zip(to_embed, new_embeddings):
+                cache[chunk_hash] = emb
+
+            # Save updated cache
+            self._save_embedding_cache(cache, str(cache_file))
+            logger.info(f"Updated cache with {len(new_embeddings)} new embeddings")
+
+        # Combine cached and new embeddings in correct order
+        all_embeddings = [None] * len(chunks)
+
+        # Place cached embeddings
+        for idx, emb in cached_embeddings:
+            all_embeddings[idx] = emb
+
+        # Place new embeddings
+        for (idx, _, _), emb in zip(to_embed, new_embeddings):
+            all_embeddings[idx] = emb
+
+        # Convert to numpy array
+        result = np.vstack(all_embeddings)
+        logger.info(f"Completed embedding generation: shape {result.shape}")
+
+        return result
+
+    def build_annoy_index(self, embeddings: np.ndarray,
+                          index_name: str, n_trees: int = 15) -> AnnoyIndex:
+        """
+        Build ANNOY index from embeddings
+
+        Args:
+            embeddings: Numpy array of embeddings
+            index_name: Name for the index file
+            n_trees: Number of trees for ANNOY index
+
+        Returns:
+            Built ANNOY index
+        """
+        logger.info(f"Building ANNOY index: {index_name}")
+
+        # Create ANNOY index
+        index = AnnoyIndex(self.embedding_dim, 'angular')  # angular = cosine similarity
+
+        # Add vectors to index
+        for i, embedding in enumerate(embeddings):
+            index.add_item(i, embedding)
+
+        # Build index
+        index.build(n_trees)
+
+        # Save index
+        index_path = self.models_dir / "indices" / "annoy" / f"{index_name}.ann"
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        index.save(str(index_path))
+
+        logger.info(f"ANNOY index saved to: {index_path}")
+        return index
+
+    def save_chunks_and_embeddings(self, chunks: List[Dict[str, Any]],
+                                   embeddings: np.ndarray, chunk_type: str):
+        """
+        Save chunks metadata and embeddings
+
+        Args:
+            chunks: List of chunk dictionaries
+            embeddings: Numpy array of embeddings
+            chunk_type: Type of chunks ("emergency" or "treatment")
+        """
+        logger.info(f"Saving {chunk_type} chunks and embeddings...")
+
+        # Create output directories
+        embeddings_dir = self.models_dir / "embeddings"
+        embeddings_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save chunks metadata
+        chunks_file = embeddings_dir / f"{chunk_type}_chunks.json"
+        with open(chunks_file, 'w', encoding='utf-8') as f:
+            json.dump(chunks, f, ensure_ascii=False, indent=2)
+
+        # Save embeddings
+        embeddings_file = embeddings_dir / f"{chunk_type}_embeddings.npy"
+        np.save(embeddings_file, embeddings)
+
+        logger.info(f"Saved {chunk_type} data:")
+        logger.info(f"  - Chunks: {chunks_file}")
+        logger.info(f"  - Embeddings: {embeddings_file}")
+
+    def validate_data_quality(self) -> Dict[str, Any]:
+        """
+        Validate data quality and return statistics
+
+        Returns:
+            Dictionary with validation statistics
+        """
+        logger.info("Validating data quality...")
+
+        validation_report = {
+            "emergency_data": {},
+            "treatment_data": {},
+            "chunks": {},
+            "embeddings": {}
+        }
+
+        # Emergency data validation
+        if self.emergency_data is not None:
+            validation_report["emergency_data"] = {
+                "total_records": len(self.emergency_data),
+                "records_with_text": self.emergency_data['clean_text'].notna().sum(),
+                "records_with_keywords": self.emergency_data['matched'].notna().sum(),
+                "avg_text_length": self.emergency_data['clean_text'].str.len().mean()
+            }
+
+        # Treatment data validation
+        if self.treatment_data is not None:
+            validation_report["treatment_data"] = {
+                "total_records": len(self.treatment_data),
+                "records_with_text": self.treatment_data['clean_text'].notna().sum(),
+                "records_with_emergency_keywords": self.treatment_data['matched'].notna().sum(),
+                "records_with_treatment_keywords": self.treatment_data['treatment_matched'].notna().sum(),
+                "avg_text_length": self.treatment_data['clean_text'].str.len().mean()
+            }
+
+        # Chunks validation
+        validation_report["chunks"] = {
+            "emergency_chunks": len(self.emergency_chunks),
+            "treatment_chunks": len(self.treatment_chunks),
+            "total_chunks": len(self.emergency_chunks) + len(self.treatment_chunks)
+        }
+
+        if self.emergency_chunks:
+            avg_chunk_length = np.mean([len(chunk['text']) for chunk in self.emergency_chunks])
+            validation_report["chunks"]["avg_emergency_chunk_length"] = avg_chunk_length
+
+        if self.treatment_chunks:
+            avg_chunk_length = np.mean([len(chunk['text']) for chunk in self.treatment_chunks])
+            validation_report["chunks"]["avg_treatment_chunk_length"] = avg_chunk_length
+
+        # Check if embeddings exist
+        embeddings_dir = self.models_dir / "embeddings"
+        if embeddings_dir.exists():
+            emergency_emb_file = embeddings_dir / "emergency_embeddings.npy"
+            treatment_emb_file = embeddings_dir / "treatment_embeddings.npy"
+
+            validation_report["embeddings"] = {
+                "emergency_embeddings_exist": emergency_emb_file.exists(),
+                "treatment_embeddings_exist": treatment_emb_file.exists()
+            }
+
+            if emergency_emb_file.exists():
+                emb = np.load(emergency_emb_file)
+                validation_report["embeddings"]["emergency_embeddings_shape"] = emb.shape
+
+            if treatment_emb_file.exists():
+                emb = np.load(treatment_emb_file)
+                validation_report["embeddings"]["treatment_embeddings_shape"] = emb.shape
+
+        # Save validation report
+        report_file = self.models_dir / "data_validation_report.json"
+        with open(report_file, 'w', encoding='utf-8') as f:
+            json.dump(validation_report, f, indent=2, default=str)
+
+        logger.info(f"Validation report saved to: {report_file}")
+        return validation_report
+
+    def process_all_data(self) -> Dict[str, Any]:
+        """
+        Complete data processing pipeline
+
+        Returns:
+            Processing summary
+        """
+        logger.info("Starting complete data processing pipeline...")
+
+        # Step 1: Load filtered data
+        self.load_filtered_data()
+
+        # Step 2: Process chunks
+        emergency_chunks = self.process_emergency_chunks()
+        treatment_chunks = self.process_treatment_chunks()
+
+        # Step 3: Generate embeddings
+        emergency_embeddings = self.generate_embeddings(emergency_chunks, "emergency")
+        treatment_embeddings = self.generate_embeddings(treatment_chunks, "treatment")
+
+        # Step 4: Build ANNOY indices
+        self.emergency_index = self.build_annoy_index(emergency_embeddings, "emergency_index")
+        self.treatment_index = self.build_annoy_index(treatment_embeddings, "treatment_index")
+
+        # Step 5: Save data
+        self.save_chunks_and_embeddings(emergency_chunks, emergency_embeddings, "emergency")
+        self.save_chunks_and_embeddings(treatment_chunks, treatment_embeddings, "treatment")
+
+        # Step 6: Validate data quality
+        validation_report = self.validate_data_quality()
+
+        # Summary
+        summary = {
+            "status": "completed",
+            "emergency_chunks": len(emergency_chunks),
+            "treatment_chunks": len(treatment_chunks),
+            "emergency_embeddings_shape": emergency_embeddings.shape,
+            "treatment_embeddings_shape": treatment_embeddings.shape,
+            "indices_created": ["emergency_index.ann", "treatment_index.ann"],
+            "validation_report": validation_report
+        }
+
+        logger.info("Data processing pipeline completed successfully!")
+        logger.info(f"Summary: {summary}")
+
+        return summary
+
+def main():
+    """Main function for testing the data processor"""
+    # Initialize processor
+    processor = DataProcessor()
+
+    # Run complete pipeline
+    summary = processor.process_all_data()
+
+    print("\n" + "="*50)
+    print("DATA PROCESSING COMPLETED")
+    print("="*50)
+    print(f"Emergency chunks: {summary['emergency_chunks']}")
+    print(f"Treatment chunks: {summary['treatment_chunks']}")
+    print(f"Emergency embeddings: {summary['emergency_embeddings_shape']}")
+    print(f"Treatment embeddings: {summary['treatment_embeddings_shape']}")
+    print(f"Indices created: {summary['indices_created']}")
+    print("="*50)
+
+if __name__ == "__main__":
+    main()
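The module above only builds and saves the indices; querying them happens elsewhere in the system. A minimal retrieval sketch against the artifacts this pipeline writes might look as follows; the paths mirror build_annoy_index and save_chunks_and_embeddings, while the query text and top_k of 5 are purely illustrative.

```python
import json
from pathlib import Path

from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

models_dir = Path("models")

# Must match the embedding_dim and 'angular' metric used when the index was built
index = AnnoyIndex(768, "angular")
index.load(str(models_dir / "indices" / "annoy" / "emergency_index.ann"))

with open(models_dir / "embeddings" / "emergency_chunks.json", encoding="utf-8") as f:
    chunks = json.load(f)  # chunk i in this list corresponds to item i in the index

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
query_vec = model.encode("emergency management of anaphylactic shock")

ids, dists = index.get_nns_by_vector(query_vec, 5, include_distances=True)
for i, d in zip(ids, dists):
    print(f"{d:.3f}  {chunks[i]['primary_keyword']}: {chunks[i]['text'][:80]}...")
```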
tests/embedding_test_analysis.md
ADDED
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Embedding Test Analysis Report
|
2 |
+
|
3 |
+
## 1. Dataset Overview
|
4 |
+
|
5 |
+
### 1.1 Data Dimensions
|
6 |
+
- Emergency Dataset: 27,493 chunks × 768 dimensions
|
7 |
+
- Treatment Dataset: 82,378 chunks × 768 dimensions
|
8 |
+
- Total Chunks: 109,871
|
9 |
+
|
10 |
+
### 1.2 Embedding Statistics
|
11 |
+
|
12 |
+
**Emergency Embeddings:**
|
13 |
+
- Value Range: -3.246 to 3.480
|
14 |
+
- Mean: -0.017
|
15 |
+
- Standard Deviation: 0.462
|
16 |
+
|
17 |
+
**Treatment Embeddings:**
|
18 |
+
- Value Range: -3.686 to 3.505
|
19 |
+
- Mean: -0.017
|
20 |
+
- Standard Deviation: 0.472
|
21 |
+
|
22 |
+
**Analysis:**
|
23 |
+
- Both datasets have similar statistical properties
|
24 |
+
- Mean values are centered around zero (-0.017)
|
25 |
+
- Standard deviations are comparable (0.462 vs 0.472)
|
26 |
+
- Treatment dataset has slightly wider range (-3.686 to 3.505 vs -3.246 to 3.480)
|
27 |
+
|
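To make these figures reproducible, a minimal sketch follows; it assumes the `models/embeddings/*.npy` layout used by the tests in this PR and the project root as the working directory.

```python
# Minimal sketch: recompute the summary statistics above from the saved
# embedding matrices (assumes the models/embeddings layout used in this PR
# and the project root as the working directory).
from pathlib import Path
import numpy as np

emb_dir = Path("models/embeddings")

for name in ("emergency", "treatment"):
    emb = np.load(emb_dir / f"{name}_embeddings.npy")
    print(f"{name}: shape={emb.shape}, "
          f"range={emb.min():.3f} to {emb.max():.3f}, "
          f"mean={emb.mean():.3f}, std={emb.std():.3f}")
```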
28 |
+
## 2. Model Performance
|
29 |
+
|
30 |
+
### 2.1 Self-Retrieval Test
|
31 |
+
- Test Size: 20 random samples
|
32 |
+
- Success Rate: 19/20 (95%)
|
33 |
+
- Failed Case: Index 27418
|
34 |
+
- Average Response Time: ~5ms per search
|
35 |
+
|
36 |
+
**Observations:**
|
37 |
+
- High success rate in self-retrieval (95%)
|
38 |
+
- One failure case needs investigation
|
39 |
+
- Search operations are consistently fast
|
40 |
+
|
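A minimal sketch of this check is shown below; it assumes the index and embedding files produced by this PR and mirrors the logic of `test_multiple_known_item_search` in `tests/test_embedding_validation.py` further down.

```python
# Minimal sketch of the self-retrieval check: each sampled vector should come
# back as its own top-1 neighbour with near-zero angular distance.
import numpy as np
from annoy import AnnoyIndex

emb = np.load("models/embeddings/emergency_embeddings.npy")
index = AnnoyIndex(768, "angular")
index.load("models/indices/annoy/emergency_index.ann")

passed = 0
for i in np.random.choice(emb.shape[0], size=20, replace=False):
    ids, dists = index.get_nns_by_vector(emb[i], 5, include_distances=True)
    if ids[0] == i and dists[0] < 1e-4:
        passed += 1
print(f"{passed}/20 self-retrieval checks passed")
```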
41 |
+
### 2.2 Cross-Dataset Search Performance
|
42 |
+
|
43 |
+
**Test Queries:**
|
44 |
+
1. "What is the treatment protocol for acute myocardial infarction?"
|
45 |
+
2. "How to manage severe chest pain with difficulty breathing?"
|
46 |
+
3. "What are the emergency procedures for anaphylactic shock?"
|
47 |
+
|
48 |
+
**Key Findings:**
|
49 |
+
- Each query returns top-5 results from both datasets
|
50 |
+
- Results show semantic understanding (not just keyword matching)
|
51 |
+
- First sentences provide good context for relevance assessment
|
52 |
+
|
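The snippet below is a condensed sketch of how such a query is run against both indices; the full version, with logging and error handling, is `tests/test_embedding_validation.py` later in this PR.

```python
# Condensed sketch: encode one query and pull top-5 neighbours from each index,
# printing the first ~80 characters of every hit (paths follow this PR's layout).
import json
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
query_vec = model.encode(["What are the emergency procedures for anaphylactic shock?"])[0]

for name in ("emergency", "treatment"):
    index = AnnoyIndex(768, "angular")
    index.load(f"models/indices/annoy/{name}_index.ann")
    with open(f"models/embeddings/{name}_chunks.json") as f:
        chunks = json.load(f)
    ids, dists = index.get_nns_by_vector(query_vec, 5, include_distances=True)
    for rank, (idx, dist) in enumerate(zip(ids, dists), 1):
        print(f"{name} #{rank} (distance {dist:.3f}): {chunks[idx]['text'][:80]}...")
```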
53 |
+
## 3. System Performance
|
54 |
+
|
55 |
+
### 3.1 Response Times
|
56 |
+
- Model Loading: ~3 seconds
|
57 |
+
- Embedding Validation: ~0.5 seconds
|
58 |
+
- Search Operations: 0.1-0.2 seconds per query
|
59 |
+
|
60 |
+
### 3.2 Resource Usage
|
61 |
+
- Model loaded on MPS (Metal Performance Shaders)
|
62 |
+
- Efficient memory usage for large datasets
|
63 |
+
- Fast vector operations
|
64 |
+
|
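Device selection is handled by `sentence-transformers`; the sketch below shows one way to request MPS explicitly and fall back to CPU. The fallback logic is an illustration, not part of this PR's code.

```python
# Sketch: request the MPS backend explicitly and fall back to CPU when it is
# unavailable (the fallback is an illustration, not part of this PR's code).
import torch
from sentence_transformers import SentenceTransformer

device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer("NeuML/pubmedbert-base-embeddings", device=device)
print(f"Embedding model loaded on: {device}")
```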
65 |
+
## 4. Recommendations
|
66 |
+
|
67 |
+
### 4.1 Immediate Improvements
|
68 |
+
1. Investigate the failed self-retrieval case (index 27418); see the inspection sketch after this list
|
69 |
+
2. Consider caching frequently accessed embeddings
|
70 |
+
3. Add more diverse test queries
|
71 |
+
|
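As a starting point for item 1, the sketch below pulls the chunk behind index 27418 out of `emergency_chunks.json` and checks whether it is unusually short or duplicated; the duplicate check is a hypothesis to test, not a confirmed cause.

```python
# Sketch: inspect the chunk behind the failed index and look for exact duplicates
# (a duplicate chunk would also sit at distance ~0 and can displace the query
# chunk from rank 1 in the self-retrieval test).
import json

with open("models/embeddings/emergency_chunks.json") as f:
    chunks = json.load(f)

failed_idx = 27418
chunk = chunks[failed_idx]
print("Primary keyword:", chunk["primary_keyword"])
print("Text length:", len(chunk["text"]))
print(chunk["text"][:200])

duplicates = [i for i, c in enumerate(chunks)
              if c["text"] == chunk["text"] and i != failed_idx]
print("Identical chunks at indices:", duplicates)
```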
72 |
+
### 4.2 Future Enhancements
|
73 |
+
1. Implement hybrid search (combine BM25 with embedding similarity); see the scoring sketch after this list
|
74 |
+
2. Add relevance scoring mechanism
|
75 |
+
3. Consider domain-specific test cases
|
76 |
+
|
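For item 1, a hybrid score could blend lexical and semantic signals roughly as sketched below; the `rank_bm25` package and the equal weighting are assumptions for illustration, not part of this PR.

```python
# Sketch of a hybrid score (assumptions: the rank_bm25 package and an equal
# weighting between lexical and semantic scores; neither is part of this PR).
import numpy as np
from rank_bm25 import BM25Okapi

def hybrid_scores(query, query_vec, chunk_texts, chunk_vecs, alpha=0.5):
    """Blend BM25 lexical scores with embedding cosine similarity."""
    bm25 = BM25Okapi([t.lower().split() for t in chunk_texts])
    lexical = bm25.get_scores(query.lower().split())
    lexical = lexical / (lexical.max() or 1.0)  # normalise to [0, 1]
    semantic = chunk_vecs @ query_vec / (
        np.linalg.norm(chunk_vecs, axis=1) * np.linalg.norm(query_vec)
    )
    return alpha * lexical + (1 - alpha) * semantic
```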
77 |
+
## 5. Log Analysis
|
78 |
+
|
79 |
+
### 5.1 Log Structure
|
80 |
+
```
|
81 |
+
timestamp - level - message
|
82 |
+
```
|
83 |
+
|
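This format corresponds to the standard `logging` configuration used by the validation script in this PR, roughly:

```python
# The "timestamp - level - message" lines come from a standard logging setup,
# the same configuration used in tests/test_embedding_validation.py.
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="embedding_validation.log"
)
logging.getLogger(__name__).info("Search completed in ~5ms")
```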
84 |
+
### 5.2 Log Levels Used
|
85 |
+
- DEBUG: Detailed operation info
|
86 |
+
- INFO: General progress and results
|
87 |
+
- WARNING: Non-critical issues
|
88 |
+
- ERROR: Critical failures
|
89 |
+
|
90 |
+
### 5.3 Key Log Categories
|
91 |
+
1. **Initialization Logs:**
|
92 |
+
- Path configurations
|
93 |
+
- Model loading
|
94 |
+
- Dataset loading
|
95 |
+
|
96 |
+
2. **Performance Logs:**
|
97 |
+
- Search operations
|
98 |
+
- Response times
|
99 |
+
- Success/failure counts
|
100 |
+
|
101 |
+
3. **Error Logs:**
|
102 |
+
- Failed searches
|
103 |
+
- Validation errors
|
104 |
+
- Connection issues
|
105 |
+
|
106 |
+
### 5.4 Notable Log Patterns
|
107 |
+
- Regular HTTPS connections to HuggingFace
|
108 |
+
- Consistent search operation timing
|
109 |
+
- Clear error messages for failures
|
110 |
+
|
111 |
+
|
112 |
+
<!-- split -->
|
113 |
+
|
114 |
+
|
115 |
+
# 🧪 Embedding Test Analysis Report
|
116 |
+
|
117 |
+
## 1. Dataset Overview
|
118 |
+
|
119 |
+
### 1.1 Data Dimensions
|
120 |
+
- **Emergency Dataset**: 27,493 chunks × 768 dimensions
|
121 |
+
- **Treatment Dataset**: 82,378 chunks × 768 dimensions
|
122 |
+
- **Total Chunks**: 109,871
|
123 |
+
|
124 |
+
### 1.2 Embedding Statistics
|
125 |
+
**Emergency Embeddings:**
|
126 |
+
- Value Range: -3.246 ~ 3.480
|
127 |
+
- Mean: -0.017
|
128 |
+
- Std: 0.462
|
129 |
+
|
130 |
+
**Treatment Embeddings:**
|
131 |
+
- Value Range: -3.686 ~ 3.505
|
132 |
+
- Mean: -0.017
|
133 |
+
- Std: 0.472
|
134 |
+
|
135 |
+
**Analysis:**
|
136 |
+
- The two datasets have very similar vector distributions, with both means close to 0
|
137 |
+
- The Treatment dataset has a slightly wider value range, likely reflecting broader semantic coverage
|
138 |
+
|
139 |
+
---
|
140 |
+
|
141 |
+
## 2. Model Performance
|
142 |
+
|
143 |
+
### 2.1 Self-Retrieval Test
|
144 |
+
- Test Size: 20
|
145 |
+
- Success Rate: **95% (19/20)**
|
146 |
+
- Failed Index: `27418`
|
147 |
+
- Avg Search Time: ~5ms
|
148 |
+
|
149 |
+
**Observation:**
|
150 |
+
- The high self-retrieval success rate indicates the index was built accurately
|
151 |
+
- The failed sample should be checked further to see whether its chunk was cut too short
|
152 |
+
|
153 |
+
|
154 |
+
<!-- Details -->
|
155 |
+
|
156 |
+
# 🔍 Embedding Search Analysis Report (Emergency vs Treatment)
|
157 |
+
|
158 |
+
## 📊 Overall Summary
|
159 |
+
|
160 |
+
| Query | Emergency Results | Treatment Results | Summary Comment |
|
161 |
+
|---------------------------------------------------------|------------------------|------------------------|-----------------------------------------------|
|
162 |
+
| 1️⃣ Treatment for Acute Myocardial Infarction | ✅ Matched well | ✅ Highly relevant | Relevant guidelines retrieved from both sets |
|
163 |
+
| 2️⃣ Management of Severe Chest Pain with Dyspnea | ⚠️ Redundant, not focused | ⚠️ Vague and general | Lacks actionable steps, contains repetition |
|
164 |
+
| 3️⃣ Emergency Procedures for Anaphylactic Shock | ⚠️ Off-topic | ✅ Precise and relevant | Emergency off-topic, but Treatment is strong |
|
165 |
+
|
166 |
+
---
|
167 |
+
|
168 |
+
## 🔍 Detailed Query Analysis
|
169 |
+
|
170 |
+
### ✅ 1. `What is the treatment protocol for acute myocardial infarction?`
|
171 |
+
|
172 |
+
#### 📌 Emergency Dataset:
|
173 |
+
- `E-2 ~ E-4` mention guidelines, STEMI, PCI.
|
174 |
+
- Distances range from `0.833 ~ 0.842` → valid.
|
175 |
+
- `E-3` is a long guideline chunk → ideal RAG candidate.
|
176 |
+
|
177 |
+
✅ Conclusion: The Emergency subset performs well; keyword-centered chunking is effective.
|
178 |
+
|
179 |
+
#### 📌 Treatment Dataset:
|
180 |
+
- `T-1` and `T-2` directly address the question with guideline phrases.
|
181 |
+
- `distance ~0.813` → strong semantic match.
|
182 |
+
- `T-5` is shorter but still contains “AMI”.
|
183 |
+
|
184 |
+
✅ Conclusion: Treatment retrieval is highly effective.
|
185 |
+
|
186 |
+
---
|
187 |
+
|
188 |
+
### ⚠️ 2. `How to manage severe chest pain with difficulty breathing?`
|
189 |
+
|
190 |
+
#### 📌 Emergency Dataset:
|
191 |
+
- `E-1 ~ E-3` are identical dyspnea passages; no actionable steps.
|
192 |
+
- `E-4 ~ E-5` are general symptom overviews, not acute response protocols.
|
193 |
+
|
194 |
+
⚠️ Issue: Semantic match exists, but lacks procedural content.
|
195 |
+
⚠️ Repetition indicates Annoy might be over-focused on a narrow cluster.
|
196 |
+
|
197 |
+
#### 📌 Treatment Dataset:
|
198 |
+
- `T-1 ~ T-3` mention dyspnea and chest pain but are mostly patient descriptions.
|
199 |
+
- `T-4` hints at emergency care for asthma but still lacks clarity.
|
200 |
+
|
201 |
+
⚠️ Conclusion: This query needs better symptom-action co-occurrence modeling.
|
202 |
+
|
203 |
+
---
|
204 |
+
|
205 |
+
### ⚠️ 3. `What are the emergency procedures for anaphylactic shock?`
|
206 |
+
|
207 |
+
#### 📌 Emergency Dataset:
|
208 |
+
- `E-1 ~ E-2`: irrelevant or truncated.
|
209 |
+
- `E-3`: mentions management during anesthesia → partial match.
|
210 |
+
- `E-4 ~ E-5`: just list multiple shock types; no protocol info.
|
211 |
+
|
212 |
+
❌ Emergency dataset lacks focused content on this topic.
|
213 |
+
|
214 |
+
#### 📌 Treatment Dataset:
|
215 |
+
- `T-1`: explicitly lists epinephrine, oxygen, IV fluids, corticosteroids → ✅ ideal
|
216 |
+
- `T-2`: confirms emergency drug prep
|
217 |
+
- `T-3 ~ T-5`: all recognize anaphylactic shock
|
218 |
+
|
219 |
+
✅ Conclusion: Treatment subset captures this case very accurately.
|
220 |
+
|
221 |
+
---
|
222 |
+
|
223 |
+
## 📏 Distance Threshold Reference
|
224 |
+
|
225 |
+
| Distance Value Range | Interpretation |
|
226 |
+
|----------------------|--------------------------------------------|
|
227 |
+
| `< 0.80` | Very strong match (almost identical) |
|
228 |
+
| `0.80 ~ 0.86` | Acceptable semantic match |
|
229 |
+
| `> 0.90` | Weak relevance, possibly off-topic chunks |
|
230 |
+
|
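For reference, Annoy's angular distance is defined as `sqrt(2 * (1 - cosine similarity))` for normalised vectors, so the bands above can be translated into cosine terms; the helper below is a sketch based on that definition.

```python
# Sketch: convert Annoy's angular distance (sqrt(2 * (1 - cos))) back to cosine
# similarity and drop hits outside the "acceptable" band from the table above.
def angular_to_cosine(distance: float) -> float:
    return 1.0 - (distance ** 2) / 2.0

def filter_hits(ids, distances, max_distance=0.86):
    """Keep only hits within the acceptable semantic match band."""
    return [(i, d, angular_to_cosine(d)) for i, d in zip(ids, distances) if d <= max_distance]

print(angular_to_cosine(0.80))  # ~0.68
print(angular_to_cosine(0.90))  # ~0.60
```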
231 |
+
---
|
232 |
+
|
233 |
+
## 🧰 Recommendations Based on Findings
|
234 |
+
|
235 |
+
| Issue Type | Suggested Solution |
|
236 |
+
|
237 |
+
|
238 |
+
```
(genAIvenv) yanbochen@YanBos-MacBook-Pro tests % python test_embedding_validation.py
|
239 |
+
|
240 |
+
|
241 |
+
=== Query: What is the treatment protocol for acute myocardial infarction? ===
|
242 |
+
Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6.65it/s]
|
243 |
+
|
244 |
+
Emergency Dataset Results:
|
245 |
+
|
246 |
+
E-1 (distance: 0.826):
|
247 |
+
myocardial infarction, white [ / bib _ ref ].
|
248 |
+
|
249 |
+
E-2 (distance: 0.833):
|
250 |
+
the management of acute myocardial infarction : guidelines and audit standards successful management of acute myocardial infarction depends in the first instance on the patient recognising the symptoms and seeking help as quickly as possible.
|
251 |
+
|
252 |
+
E-3 (distance: 0.836):
|
253 |
+
sandbox : stemi # 2017 esc guidelines for the management of acute myocardial infarction in patients presenting with st - segment elevation # # changes in recommendations # # what is new in 2017 guidelines on ami - stemi? # # ami - stemi - 2017 new recommendations # acc / aats / aha / ase / asnc / scai / scct / sts 2016 appropriate use criteria for coronary revascularization in patients with acute coronary syndromes # # stemi — immediate revascularization by pci # # stemi — initial treatment by fibrinolytic therapy # # stemi — revascularization of nonculprit artery during the initial hospitalization # 2017 aha / acc clinical performance and quality measures for adults with st - elevation and non – st - elevation myocardial infarction # # revised stemi and nstemi measures # # revised stemi and nstemi measures.
|
254 |
+
|
255 |
+
E-4 (distance: 0.842):
|
256 |
+
stemi resident survival guide # overview st elevation myocardial infarction ( stemi ) is a syndrome characterized by the presence of symptoms of myocardial ischemia associated with persistent st elevation on electrocardiogram and elevated cardiac enzymes.
|
257 |
+
|
258 |
+
E-5 (distance: 0.879):
|
259 |
+
# pre - discharge care abbreviations : ace : angiotensin converting enzyme ; lvef : left ventricular ejection fraction ; mi : myocardial infarction ; pci : percutaneous coronary intervention ; po : per os ; stemi : st elevation myocardial infarction ; vf : ventricular fibrillation ; vt : ventricular tachycardia # long term management abbreviations : ace : angiotensin converting enzyme ; arb : angiotensin receptor blocker ; mi : myocardial infarction # do ' s - a pre - hospital ecg is recommended.
|
260 |
+
|
261 |
+
Treatment Dataset Results:
|
262 |
+
|
263 |
+
T-1 (distance: 0.813):
|
264 |
+
intain the standard of care and timely access of patients with ACS, including acute myocardial infarction (AMI), to reperfusion therapy.
|
265 |
+
|
266 |
+
T-2 (distance: 0.825):
|
267 |
+
The Management of Acute Myocardial Infarction: Guidelines and Audit Standards
|
268 |
+
|
269 |
+
Successful management of acute myocardial infarction.
|
270 |
+
|
271 |
+
T-3 (distance: 0.854):
|
272 |
+
fined as STEMI, NSTEMI or unstable angina.
|
273 |
+
|
274 |
+
T-4 (distance: 0.869):
|
275 |
+
Japan, there are no clear guidelines focusing on procedural aspect of the standardized care.
|
276 |
+
|
277 |
+
T-5 (distance: 0.879):
|
278 |
+
ients with acute myocardial infarction (AMI).
|
279 |
+
|
280 |
+
|
281 |
+
=== Query: How to manage severe chest pain with difficulty breathing? ===
|
282 |
+
Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.76it/s]
|
283 |
+
|
284 |
+
Emergency Dataset Results:
|
285 |
+
|
286 |
+
E-1 (distance: 0.848):
|
287 |
+
shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
|
288 |
+
|
289 |
+
E-2 (distance: 0.849):
|
290 |
+
shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
|
291 |
+
|
292 |
+
E-3 (distance: 0.852):
|
293 |
+
shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
|
294 |
+
|
295 |
+
E-4 (distance: 0.879):
|
296 |
+
sandbox : milan # overview dyspnea is the uncomfortable awareness of one ' s own breathing.
|
297 |
+
|
298 |
+
E-5 (distance: 0.879):
|
299 |
+
sandbox : milan # overview dyspnea is the uncomfortable awareness of one ' s own breathing.
|
300 |
+
|
301 |
+
Treatment Dataset Results:
|
302 |
+
|
303 |
+
T-1 (distance: 0.827):
|
304 |
+
lly cyanotic and clammy, and may experience dyspnea or chest pain from underperfusion 13 .
|
305 |
+
|
306 |
+
T-2 (distance: 0.868):
|
307 |
+
acterized by a worsening of the patient’s respiratory symptoms (baseline dyspnea, cough, and/or sputum production) that is beyond normal day-to-day variations and leads to a change in medication.
|
308 |
+
|
309 |
+
T-3 (distance: 0.872):
|
310 |
+
ally cyanotic and clammy, and may experience dyspnea or chest pain from underperfusion 13.
|
311 |
+
|
312 |
+
T-4 (distance: 0.898):
|
313 |
+
ce used to test breathing) results show your breathing problems are worsening
|
314 |
+
- you need to go to the emergency room for asthma treatment.
|
315 |
+
|
316 |
+
T-5 (distance: 0.898):
|
317 |
+
breathlessness in a person in the last days of life.
|
318 |
+
|
319 |
+
|
320 |
+
=== Query: What are the emergency procedures for anaphylactic shock? ===
|
321 |
+
Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.16it/s]
|
322 |
+
|
323 |
+
Emergency Dataset Results:
|
324 |
+
|
325 |
+
E-1 (distance: 0.924):
|
326 |
+
the other.
|
327 |
+
|
328 |
+
E-2 (distance: 0.943):
|
329 |
+
ic defibrillation.
|
330 |
+
|
331 |
+
E-3 (distance: 0.946):
|
332 |
+
suspected anaphylactic reactions associated with anaesthesia # # summary ( 1 ) the aagbi has published guidance on management of anaphylaxis during anaesthesia in.
|
333 |
+
|
334 |
+
E-4 (distance: 0.952):
|
335 |
+
- gastrointestinal bleeding - perforated peptic ulcer - post - procedural or post - surgical - retroperitoneal hemorrhage - rupture ovarian cyst - trauma - distributive shock - sepsis - toxic shock syndrome - anaphylactic or anaphylactoid reaction - neurogenic shock - adrenal crisis # fire : focused initial rapid evaluation a focused initial rapid evaluation ( fire ) should be performed to identify patients in need of immediate intervention.
|
336 |
+
|
337 |
+
E-5 (distance: 0.954):
|
338 |
+
- surgical - retroperitoneal hemorrhage - rupture ovarian cyst - trauma - distributive shock - sepsis - toxic shock syndrome - anaphylactic or anaphylactoid reaction - neurogenic shock - adrenal crisis # fire : focused initial rapid evaluation a focused initial rapid evaluation ( fire ) should be performed to identify patients in need of immediate intervention.
|
339 |
+
|
340 |
+
Treatment Dataset Results:
|
341 |
+
|
342 |
+
T-1 (distance: 0.813):
|
343 |
+
ensitivity (anaphylactic) reactions require emergency treatment with epinephrine and other emergency measures, that may include airway management, oxygen, intravenous fluids, antihistamines, corticosteroids, and vasopressors as clinically indicated.
|
344 |
+
|
345 |
+
T-2 (distance: 0.833):
|
346 |
+
ave standard emergency treatments for hypersensitivity or anaphylactic reactions readily available in the operating room (e.
|
347 |
+
|
348 |
+
T-3 (distance: 0.838):
|
349 |
+
e, or systemic inflammation (anaphylactic shock).
|
350 |
+
|
351 |
+
T-4 (distance: 0.843):
|
352 |
+
ED AND APPROPRIATE THERAPY INSTITUTED.
|
353 |
+
|
354 |
+
T-5 (distance: 0.844):
|
355 |
+
UED AND APPROPRIATE THERAPY INSTITUTED.
```
|
tests/test_data_processing.py
ADDED
@@ -0,0 +1,228 @@
1 |
+
"""
|
2 |
+
Test script for data_processing.py
|
3 |
+
|
4 |
+
This script tests the basic functionality without running the full pipeline
|
5 |
+
to ensure everything is working correctly before proceeding with embedding generation.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import sys
|
9 |
+
from pathlib import Path
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
# Add src to path
|
13 |
+
sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
|
14 |
+
|
15 |
+
from data_processing import DataProcessor
|
16 |
+
import logging
|
17 |
+
|
18 |
+
# Setup logging
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.INFO,
|
21 |
+
format='%(levelname)s:%(name)s:%(message)s'
|
22 |
+
)
|
23 |
+
# Silence urllib3 logging
|
24 |
+
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
25 |
+
|
26 |
+
logger = logging.getLogger(__name__)
|
27 |
+
|
28 |
+
def test_data_loading():
|
29 |
+
"""Test data loading functionality"""
|
30 |
+
print("="*50)
|
31 |
+
print("TESTING DATA LOADING")
|
32 |
+
print("="*50)
|
33 |
+
|
34 |
+
try:
|
35 |
+
# Initialize processor with explicit base directory
|
36 |
+
base_dir = Path(__file__).parent.parent.resolve()
|
37 |
+
processor = DataProcessor(base_dir=str(base_dir))
|
38 |
+
|
39 |
+
# Test data loading
|
40 |
+
emergency_data, treatment_data = processor.load_filtered_data()
|
41 |
+
|
42 |
+
print(f"✅ Emergency data loaded: {len(emergency_data)} records")
|
43 |
+
print(f"✅ Treatment data loaded: {len(treatment_data)} records")
|
44 |
+
|
45 |
+
# Check data structure
|
46 |
+
print("\nEmergency data columns:", list(emergency_data.columns))
|
47 |
+
print("Treatment data columns:", list(treatment_data.columns))
|
48 |
+
|
49 |
+
# Show sample data
|
50 |
+
if len(emergency_data) > 0:
|
51 |
+
print(f"\nSample emergency matched keywords: {emergency_data['matched'].iloc[0]}")
|
52 |
+
|
53 |
+
if len(treatment_data) > 0:
|
54 |
+
print(f"Sample treatment matched keywords: {treatment_data['treatment_matched'].iloc[0]}")
|
55 |
+
|
56 |
+
return True
|
57 |
+
|
58 |
+
except Exception as e:
|
59 |
+
print(f"❌ Data loading failed: {e}")
|
60 |
+
return False
|
61 |
+
|
62 |
+
def test_chunking():
|
63 |
+
"""Test chunking functionality"""
|
64 |
+
print("\n" + "="*50)
|
65 |
+
print("TESTING CHUNKING FUNCTIONALITY")
|
66 |
+
print("="*50)
|
67 |
+
|
68 |
+
try:
|
69 |
+
# Initialize processor
|
70 |
+
processor = DataProcessor()
|
71 |
+
|
72 |
+
# Load data
|
73 |
+
processor.load_filtered_data()
|
74 |
+
|
75 |
+
# Test emergency chunking (just first few records)
|
76 |
+
print("Testing emergency chunking...")
|
77 |
+
emergency_chunks = []
|
78 |
+
for idx, row in processor.emergency_data.head(3).iterrows():
|
79 |
+
if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
|
80 |
+
chunks = processor.create_keyword_centered_chunks(
|
81 |
+
text=row['clean_text'],
|
82 |
+
matched_keywords=row['matched'],
|
83 |
+
chunk_size=512,
|
84 |
+
doc_id=str(row.get('id', idx))
|
85 |
+
)
|
86 |
+
emergency_chunks.extend(chunks)
|
87 |
+
|
88 |
+
print(f"✅ Generated {len(emergency_chunks)} emergency chunks from 3 records")
|
89 |
+
|
90 |
+
# Test treatment chunking (just first few records)
|
91 |
+
print("Testing treatment chunking...")
|
92 |
+
treatment_chunks = []
|
93 |
+
for idx, row in processor.treatment_data.head(3).iterrows():
|
94 |
+
if (pd.notna(row.get('clean_text')) and
|
95 |
+
pd.notna(row.get('treatment_matched'))):
|
96 |
+
chunks = processor.create_dual_keyword_chunks(
|
97 |
+
text=row['clean_text'],
|
98 |
+
emergency_keywords=row.get('matched', ''),
|
99 |
+
treatment_keywords=row['treatment_matched'],
|
100 |
+
chunk_size=512,
|
101 |
+
doc_id=str(row.get('id', idx))
|
102 |
+
)
|
103 |
+
treatment_chunks.extend(chunks)
|
104 |
+
|
105 |
+
print(f"✅ Generated {len(treatment_chunks)} treatment chunks from 3 records")
|
106 |
+
|
107 |
+
# Show sample chunk
|
108 |
+
if emergency_chunks:
|
109 |
+
sample_chunk = emergency_chunks[0]
|
110 |
+
print(f"\nSample emergency chunk:")
|
111 |
+
print(f" Primary keyword: {sample_chunk['primary_keyword']}")
|
112 |
+
print(f" Text length: {len(sample_chunk['text'])}")
|
113 |
+
print(f" Text preview: {sample_chunk['text'][:100]}...")
|
114 |
+
|
115 |
+
if treatment_chunks:
|
116 |
+
sample_chunk = treatment_chunks[0]
|
117 |
+
print(f"\nSample treatment chunk:")
|
118 |
+
print(f" Primary keyword: {sample_chunk['primary_keyword']}")
|
119 |
+
print(f" Emergency keywords: {sample_chunk['emergency_keywords']}")
|
120 |
+
print(f" Text length: {len(sample_chunk['text'])}")
|
121 |
+
print(f" Text preview: {sample_chunk['text'][:100]}...")
|
122 |
+
|
123 |
+
return True
|
124 |
+
|
125 |
+
except Exception as e:
|
126 |
+
print(f"❌ Chunking test failed: {e}")
|
127 |
+
import traceback
|
128 |
+
traceback.print_exc()
|
129 |
+
return False
|
130 |
+
|
131 |
+
def test_model_loading():
|
132 |
+
"""Test if we can load the embedding model"""
|
133 |
+
print("\n" + "="*50)
|
134 |
+
print("TESTING MODEL LOADING")
|
135 |
+
print("="*50)
|
136 |
+
|
137 |
+
try:
|
138 |
+
processor = DataProcessor()
|
139 |
+
|
140 |
+
print("Loading NeuML/pubmedbert-base-embeddings...")
|
141 |
+
model = processor.load_embedding_model()
|
142 |
+
|
143 |
+
print(f"✅ Model loaded successfully: {processor.embedding_model_name}")
|
144 |
+
print(f"✅ Model max sequence length: {model.max_seq_length}")
|
145 |
+
|
146 |
+
# Test a simple encoding
|
147 |
+
test_text = "Patient presents with chest pain and shortness of breath."
|
148 |
+
embedding = model.encode([test_text])
|
149 |
+
|
150 |
+
print(f"✅ Test embedding shape: {embedding.shape}")
|
151 |
+
print(f"✅ Expected dimension: {processor.embedding_dim}")
|
152 |
+
|
153 |
+
assert embedding.shape[1] == processor.embedding_dim, f"Dimension mismatch: {embedding.shape[1]} != {processor.embedding_dim}"
|
154 |
+
|
155 |
+
return True
|
156 |
+
|
157 |
+
except Exception as e:
|
158 |
+
print(f"❌ Model loading failed: {e}")
|
159 |
+
import traceback
|
160 |
+
traceback.print_exc()
|
161 |
+
return False
|
162 |
+
|
163 |
+
def test_token_chunking():
|
164 |
+
"""Test token-based chunking functionality"""
|
165 |
+
try:
|
166 |
+
processor = DataProcessor()
|
167 |
+
|
168 |
+
test_text = "Patient presents with acute chest pain radiating to left arm. Initial ECG shows ST elevation."
|
169 |
+
test_keywords = "chest pain|ST elevation"
|
170 |
+
|
171 |
+
chunks = processor.create_keyword_centered_chunks(
|
172 |
+
text=test_text,
|
173 |
+
matched_keywords=test_keywords
|
174 |
+
)
|
175 |
+
|
176 |
+
print(f"\nToken chunking test:")
|
177 |
+
print(f"✓ Generated {len(chunks)} chunks")
|
178 |
+
for i, chunk in enumerate(chunks, 1):
|
179 |
+
print(f"\nChunk {i}:")
|
180 |
+
print(f" Primary keyword: {chunk['primary_keyword']}")
|
181 |
+
print(f" Content: {chunk['text']}")
|
182 |
+
|
183 |
+
return True
|
184 |
+
|
185 |
+
except Exception as e:
|
186 |
+
print(f"❌ Token chunking test failed: {e}")
|
187 |
+
return False
|
188 |
+
|
189 |
+
def main():
|
190 |
+
"""Run all tests"""
|
191 |
+
print("Starting data processing tests...\n")
|
192 |
+
|
193 |
+
# Import pandas here since it's used in chunking test
|
194 |
+
import pandas as pd
|
195 |
+
|
196 |
+
tests = [
|
197 |
+
test_data_loading,
|
198 |
+
test_chunking,
|
199 |
+
test_model_loading,
|
200 |
+
test_token_chunking # Added new test
|
201 |
+
]
|
202 |
+
|
203 |
+
results = []
|
204 |
+
for test in tests:
|
205 |
+
result = test()
|
206 |
+
results.append(result)
|
207 |
+
|
208 |
+
print("\n" + "="*50)
|
209 |
+
print("TEST SUMMARY")
|
210 |
+
print("="*50)
|
211 |
+
|
212 |
+
for i, (test, result) in enumerate(zip(tests, results), 1):
|
213 |
+
status = "✅ PASSED" if result else "❌ FAILED"
|
214 |
+
print(f"{i}. {test.__name__}: {status}")
|
215 |
+
|
216 |
+
all_passed = all(results)
|
217 |
+
|
218 |
+
if all_passed:
|
219 |
+
print("\n🎉 All tests passed! Ready to proceed with full pipeline.")
|
220 |
+
print("\nTo run the full data processing pipeline:")
|
221 |
+
print("cd FinalProject && python src/data_processing.py")
|
222 |
+
else:
|
223 |
+
print("\n⚠️ Some tests failed. Please check the issues above.")
|
224 |
+
|
225 |
+
return all_passed
|
226 |
+
|
227 |
+
if __name__ == "__main__":
|
228 |
+
main()
|
tests/test_embedding_and_index.py
ADDED
@@ -0,0 +1,29 @@
1 |
+
import numpy as np
|
2 |
+
from annoy import AnnoyIndex
|
3 |
+
import pytest
|
4 |
+
# Make src/ importable when this test is run from the tests/ directory
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))

from data_processing import DataProcessor
|
5 |
+
|
6 |
+
@pytest.fixture(scope="module")
|
7 |
+
def processor():
|
8 |
+
# Use the project root as base_dir so models/ paths resolve regardless of the working directory
return DataProcessor(base_dir=str(Path(__file__).parent.parent.resolve()))
|
9 |
+
|
10 |
+
def test_embedding_dimensions(processor):
|
11 |
+
# load emergency embeddings
|
12 |
+
emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
|
13 |
+
expected_dim = processor.embedding_dim
|
14 |
+
assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
|
15 |
+
assert emb.shape[1] == expected_dim, (
|
16 |
+
f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
|
17 |
+
)
|
18 |
+
|
19 |
+
def test_annoy_search(processor):
|
20 |
+
# load embeddings
|
21 |
+
emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
|
22 |
+
# load Annoy index
|
23 |
+
idx = AnnoyIndex(processor.embedding_dim, 'angular')
|
24 |
+
idx.load(str(processor.models_dir / "indices" / "annoy" / "emergency_index.ann"))
|
25 |
+
# perform a sample query
|
26 |
+
query_vec = emb[0]
|
27 |
+
ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
|
28 |
+
assert len(ids) == 5
|
29 |
+
assert all(0 <= d <= 2 for d in distances)
|
tests/test_embedding_validation.py
ADDED
@@ -0,0 +1,213 @@
1 |
+
"""
|
2 |
+
Test suite for validating embeddings and ANNOY functionality.
|
3 |
+
This module ensures the quality of embeddings and the correctness of ANNOY search.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import json
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Tuple, List, Optional
|
12 |
+
from annoy import AnnoyIndex
|
13 |
+
from sentence_transformers import SentenceTransformer
|
14 |
+
|
15 |
+
class TestEmbeddingValidation:
|
16 |
+
def setup_class(self):
|
17 |
+
"""Initialize test environment with necessary data and models."""
|
18 |
+
# Setup logging
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.DEBUG,
|
21 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
22 |
+
filename='embedding_validation.log'
|
23 |
+
)
|
24 |
+
self.logger = logging.getLogger(__name__)
|
25 |
+
|
26 |
+
# Define base paths
|
27 |
+
self.project_root = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
28 |
+
self.models_dir = self.project_root / "models"
|
29 |
+
self.embeddings_dir = self.models_dir / "embeddings"
|
30 |
+
self.indices_dir = self.models_dir / "indices" / "annoy"
|
31 |
+
|
32 |
+
self.logger.info(f"Project root: {self.project_root}")
|
33 |
+
self.logger.info(f"Models directory: {self.models_dir}")
|
34 |
+
self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
|
35 |
+
|
36 |
+
try:
|
37 |
+
# Check directory existence
|
38 |
+
if not self.embeddings_dir.exists():
|
39 |
+
raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
|
40 |
+
if not self.indices_dir.exists():
|
41 |
+
raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
|
42 |
+
|
43 |
+
# Load embeddings
|
44 |
+
self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
|
45 |
+
self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
|
46 |
+
|
47 |
+
# Load chunks
|
48 |
+
with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
|
49 |
+
self.emergency_chunks = json.load(f)
|
50 |
+
with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
|
51 |
+
self.treatment_chunks = json.load(f)
|
52 |
+
|
53 |
+
# Initialize model
|
54 |
+
self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
|
55 |
+
|
56 |
+
self.logger.info("Test environment initialized successfully")
|
57 |
+
self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
|
58 |
+
self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
|
59 |
+
|
60 |
+
except FileNotFoundError as e:
|
61 |
+
self.logger.error(f"File not found: {e}")
|
62 |
+
raise
|
63 |
+
except Exception as e:
|
64 |
+
self.logger.error(f"Error during initialization: {e}")
|
65 |
+
raise
|
66 |
+
|
67 |
+
def _safe_search(
|
68 |
+
self,
|
69 |
+
index: AnnoyIndex,
|
70 |
+
query_vector: np.ndarray,
|
71 |
+
k: int = 5
|
72 |
+
) -> Tuple[Optional[List[int]], Optional[List[float]]]:
|
73 |
+
"""Safe search wrapper with error handling"""
|
74 |
+
try:
|
75 |
+
indices, distances = index.get_nns_by_vector(
|
76 |
+
query_vector, k, include_distances=True
|
77 |
+
)
|
78 |
+
self.logger.debug(f"Search successful: found {len(indices)} results")
|
79 |
+
return indices, distances
|
80 |
+
|
81 |
+
except Exception as e:
|
82 |
+
self.logger.error(f"Search failed: {str(e)}")
|
83 |
+
return None, None
|
84 |
+
|
85 |
+
def test_embedding_dimensions(self):
|
86 |
+
"""Test embedding dimensions and data quality."""
|
87 |
+
self.logger.info("\n=== Embedding Validation Report ===")
|
88 |
+
|
89 |
+
try:
|
90 |
+
# Basic dimension checks
|
91 |
+
assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
|
92 |
+
assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
|
93 |
+
|
94 |
+
# Count verification
|
95 |
+
assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
|
96 |
+
"Emergency chunks count mismatch"
|
97 |
+
assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
|
98 |
+
"Treatment chunks count mismatch"
|
99 |
+
|
100 |
+
# Data quality checks
|
101 |
+
for name, emb in [("Emergency", self.emergency_emb),
|
102 |
+
("Treatment", self.treatment_emb)]:
|
103 |
+
# Check for NaN and Inf
|
104 |
+
assert not np.isnan(emb).any(), f"{name} contains NaN values"
|
105 |
+
assert not np.isinf(emb).any(), f"{name} contains Inf values"
|
106 |
+
|
107 |
+
# Value distribution analysis
|
108 |
+
self.logger.info(f"\n{name} Embeddings Statistics:")
|
109 |
+
self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
|
110 |
+
self.logger.info(f"- Mean: {np.mean(emb):.3f}")
|
111 |
+
self.logger.info(f"- Std: {np.std(emb):.3f}")
|
112 |
+
|
113 |
+
self.logger.info("\n✅ All embedding validations passed")
|
114 |
+
|
115 |
+
except AssertionError as e:
|
116 |
+
self.logger.error(f"Validation failed: {str(e)}")
|
117 |
+
raise
|
118 |
+
|
119 |
+
def test_multiple_known_item_search(self):
|
120 |
+
"""Test ANNOY search with multiple random samples."""
|
121 |
+
self.logger.info("\n=== Multiple Known-Item Search Test ===")
|
122 |
+
|
123 |
+
emergency_index = AnnoyIndex(768, 'angular')
|
124 |
+
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
125 |
+
|
126 |
+
# Test 20 random samples
|
127 |
+
test_indices = np.random.choice(
|
128 |
+
self.emergency_emb.shape[0],
|
129 |
+
size=20,
|
130 |
+
replace=False
|
131 |
+
)
|
132 |
+
|
133 |
+
success_count = 0
|
134 |
+
for test_idx in test_indices:
|
135 |
+
try:
|
136 |
+
test_emb = self.emergency_emb[test_idx]
|
137 |
+
indices, distances = self._safe_search(emergency_index, test_emb)
|
138 |
+
|
139 |
+
if indices is None:
|
140 |
+
continue
|
141 |
+
|
142 |
+
# Verify self-retrieval
|
143 |
+
assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
|
144 |
+
assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
|
145 |
+
success_count += 1
|
146 |
+
|
147 |
+
except AssertionError as e:
|
148 |
+
self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
|
149 |
+
|
150 |
+
self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
|
151 |
+
assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
|
152 |
+
|
153 |
+
def test_balanced_cross_dataset_search(self):
|
154 |
+
"""Test search across both emergency and treatment datasets."""
|
155 |
+
self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
|
156 |
+
|
157 |
+
# Initialize indices
|
158 |
+
emergency_index = AnnoyIndex(768, 'angular')
|
159 |
+
treatment_index = AnnoyIndex(768, 'angular')
|
160 |
+
|
161 |
+
try:
|
162 |
+
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
163 |
+
treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
|
164 |
+
|
165 |
+
# Test queries
|
166 |
+
test_queries = [
|
167 |
+
"What is the treatment protocol for acute myocardial infarction?",
|
168 |
+
"How to manage severe chest pain with difficulty breathing?",
|
169 |
+
"What are the emergency procedures for anaphylactic shock?"
|
170 |
+
]
|
171 |
+
|
172 |
+
for query in test_queries:
|
173 |
+
print(f"\n\n=== Query: {query} ===")
|
174 |
+
|
175 |
+
# Generate query vector
|
176 |
+
query_emb = self.model.encode([query])[0]
|
177 |
+
|
178 |
+
# Get top-5 results from each dataset
|
179 |
+
e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
|
180 |
+
t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
|
181 |
+
|
182 |
+
if None in [e_indices, e_distances, t_indices, t_distances]:
|
183 |
+
self.logger.error("Search failed for one or both datasets")
|
184 |
+
continue
|
185 |
+
|
186 |
+
# Print first sentence of each result
|
187 |
+
print("\nEmergency Dataset Results:")
|
188 |
+
for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
|
189 |
+
text = self.emergency_chunks[idx]['text']
|
190 |
+
first_sentence = text.split('.')[0] + '.'
|
191 |
+
print(f"\nE-{i} (distance: {dist:.3f}):")
|
192 |
+
print(first_sentence)
|
193 |
+
|
194 |
+
print("\nTreatment Dataset Results:")
|
195 |
+
for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
|
196 |
+
text = self.treatment_chunks[idx]['text']
|
197 |
+
first_sentence = text.split('.')[0] + '.'
|
198 |
+
print(f"\nT-{i} (distance: {dist:.3f}):")
|
199 |
+
print(first_sentence)
|
200 |
+
|
201 |
+
except Exception as e:
|
202 |
+
self.logger.error(f"Test failed: {str(e)}")
|
203 |
+
raise
|
204 |
+
else:
|
205 |
+
self.logger.info("\n✅ Cross-dataset search test completed")
|
206 |
+
|
207 |
+
if __name__ == "__main__":
|
208 |
+
# Manual test execution
|
209 |
+
test = TestEmbeddingValidation()
|
210 |
+
test.setup_class()
|
211 |
+
test.test_embedding_dimensions()
|
212 |
+
test.test_multiple_known_item_search()
|
213 |
+
test.test_balanced_cross_dataset_search()
|