Merge pull request #2 from YanBoChen0928/embedding
### 🔧 Git History Cleanup: Removed Large Files + Forced Push
- .gitignore +29 -5
- dataset/analysis/integrity_check/integrity_check_report.json +0 -29
- dataset/analysis/keyword_matching_test_results.json +0 -151
- dataset/analysis/stats/analysis_stats_emergency_subset.json +0 -55
- dataset/analysis/stats/analysis_stats_emergency_subset_opt.json +0 -55
- dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json +0 -293
- dataset/scripts/data_explorer_treatment.py +1 -1
- requirements.txt +1 -0
- src/__init__.py +8 -0
- src/data_processing.py +698 -0
- tests/embedding_test_analysis.md +355 -0
- tests/test_data_processing.py +228 -0
- tests/test_embedding_and_index.py +29 -0
- tests/test_embedding_validation.py +213 -0
.gitignore
CHANGED
@@ -1,10 +1,34 @@
-
-
-#virtual environment
+# 🧠 Virtual environments
 genAIvenv/
 .final_project_env/
 
+# 💻 OS / Editor garbage
+.DS_Store
+.vscode/
+
+# 📁 Documentation and project folders
+docs/
+dataset/dataset/
+
+# 🧾 Compiled / output files
+*.pyc
+*.log
+*.zip
+*.tar.gz
+*.mp4
+*.mov
+*.json
+*.png
-
 
+# 🚫 Large files - models
+models/cache/
+models/cache/*.pkl
+models/embeddings/*.npy
+models/embeddings/*.json
+models/indices/
+models/indices/annoy/*.ann
-
 
+# 🚫 Redundant catch-all for large file extensions
+*.pkl
+*.npy
+*.ann
dataset/analysis/integrity_check/integrity_check_report.json
DELETED
@@ -1,29 +0,0 @@
-{
-  "sample_analysis": {
-    "matched": {
-      "non_null": 100,
-      "non_empty": 100,
-      "unique_values": 84
-    },
-    "treatment_matched": {
-      "non_null": 100,
-      "non_empty": 100,
-      "unique_values": 100
-    }
-  },
-  "full_file_analysis": {
-    "total_records": 9367,
-    "matched_column": {
-      "non_null_count": 9367,
-      "non_empty_count": 9367,
-      "null_percentage": 0.0
-    },
-    "treatment_matched_column": {
-      "non_null_count": 9367,
-      "non_empty_count": 9367,
-      "null_percentage": 0.0
-    },
-    "both_matched_count": 3315,
-    "both_matched_percentage": 35.39019963702359
-  }
-}
dataset/analysis/keyword_matching_test_results.json
DELETED
@@ -1,151 +0,0 @@
-{
-  "special_terms_matching": [
-    {
-      "clean_text": "Patient needs an x-ray of the chest",
-      "category": "x-ray variants",
-      "matched": "x-ray"
-    },
-    {
-      "clean_text": "Ordered chest xray",
-      "category": "x-ray variants",
-      "matched": "xray"
-    },
-    {
-      "clean_text": "X ray shows pneumonia",
-      "category": "x-ray variants",
-      "matched": "X ray"
-    },
-    {
-      "clean_text": "XRAY negative",
-      "category": "x-ray variants",
-      "matched": "XRAY"
-    },
-    {
-      "clean_text": "CT scan reveals nodule",
-      "category": "ct-scan variants",
-      "matched": "CT scan"
-    },
-    {
-      "clean_text": "CT-scan indicates mass",
-      "category": "ct-scan variants",
-      "matched": "CT-scan"
-    },
-    {
-      "clean_text": "Requires ctscan urgently",
-      "category": "ct-scan variants",
-      "matched": "ctscan"
-    },
-    {
-      "clean_text": "CTSCAN of abdomen",
-      "category": "ct-scan variants",
-      "matched": "CTSCAN"
-    },
-    {
-      "clean_text": "Point-of-care testing needed",
-      "category": "point-of-care variants",
-      "matched": "Point-of-care"
-    },
-    {
-      "clean_text": "Point of care ultrasound",
-      "category": "point-of-care variants",
-      "matched": "Point of care"
-    },
-    {
-      "clean_text": "POC testing results",
-      "category": "point-of-care variants",
-      "matched": ""
-    },
-    {
-      "clean_text": "Ordered both x-ray and CT scan",
-      "category": "mixed cases",
-      "matched": "x-ray|CT scan"
-    },
-    {
-      "clean_text": "XRAY and CTSCAN negative",
-      "category": "mixed cases",
-      "matched": "XRAY|CTSCAN"
-    },
-    {
-      "clean_text": "Multiple point-of-care tests with x-ray",
-      "category": "mixed cases",
-      "matched": "point-of-care|x-ray"
-    },
-    {
-      "clean_text": "No imaging mentioned",
-      "category": "negative cases",
-      "matched": ""
-    },
-    {
-      "clean_text": "Regular examination only",
-      "category": "negative cases",
-      "matched": ""
-    },
-    {
-      "clean_text": "Laboratory tests pending",
-      "category": "negative cases",
-      "matched": ""
-    }
-  ],
-  "basic_matching": [
-    {
-      "clean_text": "Emergency treatment required",
-      "category": "simple matches",
-      "matched": "Emergency"
-    },
-    {
-      "clean_text": "Acute condition observed",
-      "category": "simple matches",
-      "matched": "Acute"
-    },
-    {
-      "clean_text": "Urgent care needed",
-      "category": "simple matches",
-      "matched": "Urgent"
-    },
-    {
-      "clean_text": "EMERGENCY situation",
-      "category": "case variations",
-      "matched": "EMERGENCY"
-    },
-    {
-      "clean_text": "Acute RESPIRATORY failure",
-      "category": "case variations",
-      "matched": "Acute"
-    },
-    {
-      "clean_text": "URgent surgical intervention",
-      "category": "case variations",
-      "matched": "URgent"
-    },
-    {
-      "clean_text": "Emergency treatment for acute condition",
-      "category": "multiple matches",
-      "matched": "Emergency|acute"
-    },
-    {
-      "clean_text": "Urgent care in emergency department",
-      "category": "multiple matches",
-      "matched": "Urgent|emergency"
-    },
-    {
-      "clean_text": "Acute respiratory emergency",
-      "category": "multiple matches",
-      "matched": "Acute|emergency"
-    },
-    {
-      "clean_text": "Non-emergency situation",
-      "category": "partial words",
-      "matched": "emergency"
-    },
-    {
-      "clean_text": "Subacute condition",
-      "category": "partial words",
-      "matched": ""
-    },
-    {
-      "clean_text": "Emergency-related",
-      "category": "partial words",
-      "matched": "Emergency"
-    }
-  ]
-}
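The deleted fixture above documents the intended matching behaviour: hyphen and case variants of terms like "x-ray" match, "Non-emergency" still matches "emergency", but "Subacute" does not match "acute". A minimal sketch of word-boundary keyword matching that reproduces those cases is shown below; the helper name and keyword lists are illustrative and not part of the repository.

```python
import re

def match_keywords(text: str, keywords: list[str]) -> str:
    """Return pipe-separated keywords found in text, using word-boundary matching."""
    hits = []
    for kw in keywords:
        # \b before the keyword prevents "Subacute" from matching "acute",
        # while "Non-emergency" still matches because "-" is a word boundary.
        pattern = r"\b" + re.escape(kw) + r"\b"
        m = re.search(pattern, text, flags=re.IGNORECASE)
        if m:
            hits.append(m.group(0))  # keep the casing found in the text, as in the fixture
    return "|".join(hits)

print(match_keywords("Ordered both x-ray and CT scan", ["x-ray", "CT scan"]))  # x-ray|CT scan
print(match_keywords("Subacute condition", ["acute"]))                         # (empty string)
print(match_keywords("Non-emergency situation", ["emergency"]))                # emergency
```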
dataset/analysis/stats/analysis_stats_emergency_subset.json
DELETED
@@ -1,55 +0,0 @@
-{
-  "basic_statistics": {
-    "total_records": 10282,
-    "avg_length": 25185.078194903715
-  },
-  "keyword_statistics": {
-    "Acute abdomen": 52,
-    "Acute bleeding": 31,
-    "Acute Coronary Syndrome": 345,
-    "Acute Kidney Injury": 202,
-    "Acute pancreatitis": 214,
-    "Acute respiratory distress syndrome": 231,
-    "Acute stroke": 67,
-    "Anaphylaxis": 1016,
-    "Anaphylactic Shock": 153,
-    "Arrhythmia": 1547,
-    "Atrial fibrillation": 771,
-    "Atrial flutter": 146,
-    "Bradycardia": 884,
-    "Cardiac arrest": 614,
-    "Cardiogenic Shock": 196,
-    "Chest pain": 1433,
-    "Dyspnea": 1319,
-    "Fever": 4270,
-    "Gastrointestinal Hemorrhage": 158,
-    "GI bleeding": 105,
-    "Hemorrhage": 1611,
-    "Hemorrhagic stroke": 117,
-    "Hyperthermia": 305,
-    "Hypovolemic Shock": 63,
-    "Hypotension": 1929,
-    "Hypothermia": 356,
-    "Internal bleeding": 70,
-    "Intracranial Hemorrhages": 6,
-    "Ischemic stroke": 224,
-    "Loss of consciousness": 422,
-    "Myocardial Infarction": 1708,
-    "MI": 10183,
-    "Pulmonary Edema": 487,
-    "Pulmonary Embolism": 654,
-    "Respiratory distress": 730,
-    "Respiratory failure": 579,
-    "Sepsis": 1181,
-    "Severe Sepsis": 81,
-    "Septic Shock": 244,
-    "Shock": 1881,
-    "Status Epilepticus": 150,
-    "Syncope": 834,
-    "Tachycardia": 1650,
-    "Tachypnea": 268,
-    "Traumatic Brain Injury": 171,
-    "Ventricular Tachycardia": 491,
-    "Ventricular fibrillation": 295
-  }
-}
dataset/analysis/stats/analysis_stats_emergency_subset_opt.json
DELETED
@@ -1,55 +0,0 @@
-{
-  "basic_statistics": {
-    "total_records": 11914,
-    "avg_length": 23847.07579318449
-  },
-  "keyword_statistics": {
-    "Acute abdomen": 52,
-    "Acute bleeding": 31,
-    "Acute Coronary Syndrome": 351,
-    "Acute Kidney Injury": 202,
-    "Acute pancreatitis": 214,
-    "Acute respiratory distress syndrome": 231,
-    "Acute stroke": 67,
-    "Anaphylaxis": 1016,
-    "Anaphylactic Shock": 153,
-    "Arrhythmia": 1564,
-    "Atrial fibrillation": 771,
-    "Atrial flutter": 146,
-    "Bradycardia": 884,
-    "Cardiac arrest": 614,
-    "Cardiogenic Shock": 196,
-    "Chest pain": 1434,
-    "Dyspnea": 1319,
-    "Fever": 4279,
-    "Gastrointestinal Hemorrhage": 158,
-    "GI bleeding": 105,
-    "Hemorrhage": 1621,
-    "Hemorrhagic stroke": 117,
-    "Hyperthermia": 305,
-    "Hypovolemic Shock": 63,
-    "Hypotension": 1929,
-    "Hypothermia": 356,
-    "Internal bleeding": 70,
-    "Intracranial Hemorrhages": 6,
-    "Ischemic stroke": 225,
-    "Loss of consciousness": 422,
-    "Myocardial Infarction": 1710,
-    "MI": 11773,
-    "Pulmonary Edema": 487,
-    "Pulmonary Embolism": 654,
-    "Respiratory distress": 730,
-    "Respiratory failure": 579,
-    "Sepsis": 1188,
-    "Severe Sepsis": 81,
-    "Septic Shock": 244,
-    "Shock": 1892,
-    "Status Epilepticus": 150,
-    "Syncope": 834,
-    "Tachycardia": 1651,
-    "Tachypnea": 268,
-    "Traumatic Brain Injury": 171,
-    "Ventricular Tachycardia": 492,
-    "Ventricular fibrillation": 295
-  }
-}
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json
DELETED
@@ -1,293 +0,0 @@
-{
-  "basic_statistics": {
-    "total_records": 9367,
-    "avg_text_length": 27179.22952919825,
-    "emergency_keywords_count": 47,
-    "treatment_keywords_count": 105
-  },
-  "emergency_keyword_stats": {
-    "Acute abdomen": 51,
-    "Acute bleeding": 31,
-    "Acute Coronary Syndrome": 332,
-    "Acute Kidney Injury": 200,
-    "Acute pancreatitis": 202,
-    "Acute respiratory distress syndrome": 225,
-    "Acute stroke": 65,
-    "Anaphylaxis": 1002,
-    "Anaphylactic Shock": 148,
-    "Arrhythmia": 1490,
-    "Atrial fibrillation": 736,
-    "Atrial flutter": 139,
-    "Bradycardia": 845,
-    "Cardiac arrest": 600,
-    "Cardiogenic Shock": 192,
-    "Chest pain": 1408,
-    "Dyspnea": 1296,
-    "Fever": 4008,
-    "Gastrointestinal Hemorrhage": 158,
-    "GI bleeding": 103,
-    "Hemorrhage": 1532,
-    "Hemorrhagic stroke": 109,
-    "Hyperthermia": 283,
-    "Hypovolemic Shock": 61,
-    "Hypotension": 1897,
-    "Hypothermia": 340,
-    "Internal bleeding": 67,
-    "Intracranial Hemorrhages": 5,
-    "Ischemic stroke": 216,
-    "Loss of consciousness": 406,
-    "Myocardial Infarction": 1607,
-    "MI": 9316,
-    "Pulmonary Edema": 471,
-    "Pulmonary Embolism": 624,
-    "Respiratory distress": 713,
-    "Respiratory failure": 554,
-    "Sepsis": 1145,
-    "Severe Sepsis": 81,
-    "Septic Shock": 231,
-    "Shock": 1702,
-    "Status Epilepticus": 149,
-    "Syncope": 806,
-    "Tachycardia": 1576,
-    "Tachypnea": 262,
-    "Traumatic Brain Injury": 151,
-    "Ventricular Tachycardia": 461,
-    "Ventricular fibrillation": 280
-  },
-  "treatment_keyword_stats": {
-    "ACLS": 30,
-    "administer": 3881,
-    "Adrenaline": 135,
-    "Advanced Cardiac Life Support": 34,
-    "Airway Management": 174,
-    "alpha blocker": 35,
-    "Amiodarone": 315,
-    "analgesia": 323,
-    "Anesthesia Procedural": 0,
-    "Anti-Bacterial Agents": 1,
-    "antibiotic": 1922,
-    "arterial line placement": 0,
-    "beta blocker": 297,
-    "Bi-level Positive Airway Pressure": 6,
-    "bipap": 25,
-    "Blood Transfusion": 379,
-    "Bosmin": 0,
-    "Cardiopulmonary Resuscitation": 131,
-    "Cardioversion": 142,
-    "Catheterization Arterial": 0,
-    "Catheterization Central Venous": 0,
-    "central line placement": 6,
-    "compression dressing": 2,
-    "Computed Tomography": 518,
-    "cpap": 84,
-    "cpr": 151,
-    "crystalloids": 45,
-    "ct scan": 1036,
-    "Defibrillation": 96,
-    "Dopamine": 389,
-    "Dosage Forms": 210,
-    "dose": 5344,
-    "Drug Administration Routes": 0,
-    "Drug Therapy": 773,
-    "Epinephrine": 806,
-    "fluid": 2938,
-    "fluid resuscitation": 115,
-    "hemodynamic monitoring": 43,
-    "Hemodynamics": 135,
-    "Hemostasis": 180,
-    "Ibuprofen": 269,
-    "icu transfer": 9,
-    "Insulin": 808,
-    "intervention": 2695,
-    "intubation": 493,
-    "Intratracheal Intubation": 3,
-    "Intravenous Infusion": 576,
-    "iv fluids": 75,
-    "laboratory techniques": 29,
-    "laboratory testing": 296,
-    "levophed": 11,
-    "Lidocaine": 212,
-    "manage": 4416,
-    "management": 4008,
-    "medication": 4698,
-    "midazolam": 204,
-    "monitor": 4521,
-    "monitoring": 3593,
-    "Morphine": 289,
-    "Nebulization": 41,
-    "nitroglycerin": 125,
-    "NTG": 81,
-    "Norepinephrine": 392,
-    "normal saline": 252,
-    "Ondansetron": 43,
-    "Oxygen": 1779,
-    "Oxygen Inhalation Therapy": 2,
-    "oxygen therapy": 178,
-    "Patient Management": 281,
-    "Patient Monitoring": 107,
-    "POCUS": 10,
-    "point of care ultrasound": 2,
-    "procedural sedation": 26,
-    "procedure": 3073,
-    "radiologic imaging": 5,
-    "Radiography": 218,
-    "resuscitation": 539,
-    "Sedation": 602,
-    "splinting": 26,
-    "Splints": 29,
-    "supportive care": 564,
-    "surgical procedures": 482,
-    "Surgical Procedures Operative": 0,
-    "surgery": 3531,
-    "Suture": 179,
-    "Suturing": 53,
-    "Therapeutic Intervention": 181,
-    "Therapeutics": 182,
-    "Therapy": 6117,
-    "tourniquet": 56,
-    "transfusion": 826,
-    "treat": 8270,
-    "treatment": 7719,
-    "Ultrasonography Point of Care": 0,
-    "ultrasound": 1273,
-    "Vasoconstrictor Agents": 2,
-    "vasopressors": 188,
-    "ventilation support": 14,
-    "Ventilators": 86,
-    "Vital Signs": 459,
-    "vital signs monitoring": 1,
-    "wound care": 73,
-    "Wound Dressing": 30,
-    "Wound Management": 37,
-    "X-Ray": 1293
-  },
-  "cooccurrence_analysis": [
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 3488,
-      "percentage": 37.23710899967973
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 2698,
-      "percentage": 28.803245436105477
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "dose",
-      "cooccurrence_count": 2430,
-      "percentage": 25.94213729048788
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "medication",
-      "cooccurrence_count": 1979,
-      "percentage": 21.127362015586634
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1760,
-      "percentage": 18.789366926443897
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "management",
-      "cooccurrence_count": 1753,
-      "percentage": 18.714636489804633
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "treat",
-      "cooccurrence_count": 1744,
-      "percentage": 18.618554499839863
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "monitoring",
-      "cooccurrence_count": 1674,
-      "percentage": 17.87125013344721
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 1558,
-      "percentage": 16.63286004056795
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "surgery",
-      "cooccurrence_count": 1505,
-      "percentage": 16.06704387744208
-    },
-    {
-      "emergency_keyword": "Tachycardia",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1441,
-      "percentage": 15.383794171025942
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "dose",
-      "cooccurrence_count": 1423,
-      "percentage": 15.191630191096403
-    },
-    {
-      "emergency_keyword": "Myocardial Infarction",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1369,
-      "percentage": 14.615138251307783
-    },
-    {
-      "emergency_keyword": "Shock",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1340,
-      "percentage": 14.305540728087967
-    },
-    {
-      "emergency_keyword": "Fever",
-      "treatment_keyword": "fluid",
-      "cooccurrence_count": 1330,
-      "percentage": 14.198782961460447
-    },
-    {
-      "emergency_keyword": "Hemorrhage",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1328,
-      "percentage": 14.177431408134941
-    },
-    {
-      "emergency_keyword": "Hypotension",
-      "treatment_keyword": "monitoring",
-      "cooccurrence_count": 1325,
-      "percentage": 14.145404078146683
-    },
-    {
-      "emergency_keyword": "Tachycardia",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 1277,
-      "percentage": 13.632966798334579
-    },
-    {
-      "emergency_keyword": "Dyspnea",
-      "treatment_keyword": "treatment",
-      "cooccurrence_count": 1228,
-      "percentage": 13.10985374185972
-    },
-    {
-      "emergency_keyword": "Myocardial Infarction",
-      "treatment_keyword": "Therapy",
-      "cooccurrence_count": 1215,
-      "percentage": 12.97106864524394
-    }
-  ],
-  "path_b_validation": {
-    "avg_emergency_density": 0.3098621434407273,
-    "avg_treatment_density": 0.6108515041451529,
-    "high_density_records": 1298,
-    "precision_estimate": 0.9995729689334899
-  },
-  "condition_mapping_candidates": {}
-}
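For context, each cooccurrence_analysis entry removed above pairs an emergency keyword with a treatment keyword and reports the share of records mentioning both (for example 3488 of 9367 records ≈ 37.24% for Fever + treatment). A rough sketch of how such a count could be derived from the filtered dataframe is shown below; the column names follow the loader in src/data_processing.py, while the function itself is illustrative and the actual analysis script is not part of this diff.

```python
import pandas as pd

def cooccurrence(df: pd.DataFrame, emergency_kw: str, treatment_kw: str) -> dict:
    """Count records whose clean_text mentions both keywords (case-insensitive substring check)."""
    text = df["clean_text"].fillna("").str.lower()
    both = (text.str.contains(emergency_kw.lower(), regex=False)
            & text.str.contains(treatment_kw.lower(), regex=False))
    count = int(both.sum())
    return {
        "emergency_keyword": emergency_kw,
        "treatment_keyword": treatment_kw,
        "cooccurrence_count": count,
        "percentage": 100.0 * count / len(df),  # e.g. 100 * 3488 / 9367 ≈ 37.24
    }
```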
dataset/scripts/data_explorer_treatment.py
CHANGED
@@ -2,7 +2,7 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-import numpy
+# Removed duplicate import of numpy
 from pathlib import Path
 import json
 from tqdm import tqdm
requirements.txt
CHANGED
@@ -64,6 +64,7 @@ safehttpx==0.1.6
 safetensors==0.5.3
 seaborn==0.13.2
 semantic-version==2.10.0
+sentence-transformers==3.0.1
 shellingham==1.5.4
 six==1.17.0
 sniffio==1.3.1
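sentence-transformers==3.0.1 is the only new dependency in this PR; it provides the encoder used by src/data_processing.py. A quick sanity check that the pin works with the PubMedBERT embedding model might look like the sketch below (the sample sentence is arbitrary, and the first run downloads the model from Hugging Face):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
vec = model.encode(["Acute chest pain with dyspnea"])
print(vec.shape)  # expected (1, 768), matching embedding_dim in DataProcessor
```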
src/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""
+OnCall.ai src package
+
+This package contains the core implementation of the OnCall.ai system.
+"""
+
+# Version
+__version__ = '0.1.0'
src/data_processing.py
ADDED
@@ -0,0 +1,698 @@
+"""
+OnCall.ai Data Processing Module
+
+This module handles:
+1. Loading filtered medical guideline data
+2. Creating intelligent chunks based on matched keywords
+3. Generating embeddings using NeuML/pubmedbert-base-embeddings
+4. Building ANNOY indices for vector search
+5. Data quality validation
+
+Author: OnCall.ai Team
+Date: 2025-07-26
+"""
+
+# Required imports for core functionality
+import json
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from typing import List, Dict, Tuple, Any
+from sentence_transformers import SentenceTransformer
+from annoy import AnnoyIndex
+import logging
+from tqdm import tqdm
+
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO,  # change between INFO and DEBUG level
+    format='%(levelname)s:%(name)s:%(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Explicitly define what should be exported
+__all__ = ['DataProcessor']
+
+class DataProcessor:
+    """Main data processing class for OnCall.ai RAG system"""
+
+    def __init__(self, base_dir: str = None):
+        """
+        Initialize DataProcessor
+
+        Args:
+            base_dir: Base directory path for the project
+        """
+        self.base_dir = Path(base_dir).resolve() if base_dir else Path(__file__).parent.parent.resolve()
+        self.dataset_dir = (self.base_dir / "dataset" / "dataset").resolve()  # modify to actual dataset directory
+        self.models_dir = (self.base_dir / "models").resolve()
+
+        # Model configuration
+        self.embedding_model_name = "NeuML/pubmedbert-base-embeddings"
+        self.embedding_dim = 768  # PubMedBERT dimension
+        self.chunk_size = 256  # Changed to tokens instead of characters
+        self.chunk_overlap = 64  # Added overlap configuration
+
+        # Initialize model and tokenizer (will be loaded when needed)
+        self.embedding_model = None
+        self.tokenizer = None
+
+        # Data containers
+        self.emergency_data = None
+        self.treatment_data = None
+        self.emergency_chunks = []
+        self.treatment_chunks = []
+
+        # Initialize indices
+        self.emergency_index = None
+        self.treatment_index = None
+
+        logger.info(f"Initialized DataProcessor with:")
+        logger.info(f"  Base directory: {self.base_dir}")
+        logger.info(f"  Dataset directory: {self.dataset_dir}")
+        logger.info(f"  Models directory: {self.models_dir}")
+        logger.info(f"  Chunk size (tokens): {self.chunk_size}")
+        logger.info(f"  Chunk overlap (tokens): {self.chunk_overlap}")
+
+    def load_embedding_model(self):
+        """Load the embedding model and initialize tokenizer"""
+        if self.embedding_model is None:
+            logger.info(f"Loading embedding model: {self.embedding_model_name}")
+            self.embedding_model = SentenceTransformer(self.embedding_model_name)
+            self.tokenizer = self.embedding_model.tokenizer
+            logger.info("Embedding model and tokenizer loaded successfully")
+        return self.embedding_model
+
+    def load_filtered_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """
+        Load pre-filtered emergency and treatment data
+
+        Returns:
+            Tuple of (emergency_data, treatment_data) DataFrames
+        """
+        logger.info("Loading filtered medical data...")
+
+        # File paths
+        emergency_path = (self.dataset_dir / "emergency" / "emergency_subset_opt.jsonl").resolve()
+        treatment_path = (self.dataset_dir / "emergency_treatment" / "emergency_treatment_subset_opt.jsonl").resolve()
+
+        logger.info(f"Looking for emergency data at: {emergency_path}")
+        logger.info(f"Looking for treatment data at: {treatment_path}")
+
+        # Validate file existence
+        if not emergency_path.exists():
+            raise FileNotFoundError(f"Emergency data not found: {emergency_path}")
+        if not treatment_path.exists():
+            raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
+
+        # Load data
+        self.emergency_data = pd.read_json(str(emergency_path), lines=True)  # use str() to ensure the path is handled correctly
+        self.treatment_data = pd.read_json(str(treatment_path), lines=True)
+
+        logger.info(f"Loaded {len(self.emergency_data)} emergency records")
+        logger.info(f"Loaded {len(self.treatment_data)} treatment records")
+
+        return self.emergency_data, self.treatment_data
+
+    def create_keyword_centered_chunks(self, text: str, matched_keywords: str,
+                                       chunk_size: int = None, doc_id: str = None) -> List[Dict[str, Any]]:
+        """
+        Create chunks centered around matched keywords using tokenizer
+
+        Args:
+            text: Input text
+            matched_keywords: Pipe-separated keywords (e.g., "MI|chest pain|fever")
+            chunk_size: Size of each chunk in tokens (defaults to self.chunk_size)
+            doc_id: Document ID for tracking
+
+        Returns:
+            List of chunk dictionaries
+        """
+        if not matched_keywords or pd.isna(matched_keywords):
+            return []
+
+        # Load model if not loaded (to get tokenizer)
+        if self.tokenizer is None:
+            self.load_embedding_model()
+
+        # Convert text and keywords to lowercase at the start
+        text = text.lower()
+        keywords = [kw.lower() for kw in matched_keywords.split("|")] if matched_keywords else []
+
+        chunk_size = chunk_size or self.chunk_size
+        chunks = []
+
+        # Calculate character-to-token ratio using a sample around the first keyword
+        if keywords:
+            first_keyword = keywords[0]
+            first_pos = text.find(first_keyword)
+            if first_pos != -1:
+                # Take a sample around the first keyword for ratio calculation
+                sample_start = max(0, first_pos - 100)
+                sample_end = min(len(text), first_pos + len(first_keyword) + 100)
+                sample_text = text[sample_start:sample_end]
+                sample_tokens = len(self.tokenizer.tokenize(sample_text))
+                chars_per_token = len(sample_text) / sample_tokens if sample_tokens > 0 else 4.0
+            else:
+                chars_per_token = 4.0  # Fallback ratio
+        else:
+            chars_per_token = 4.0  # Default ratio
+
+        # Process keywords
+        for i, keyword in enumerate(keywords):
+            # Find keyword position in text (already lowercase)
+            keyword_pos = text.find(keyword)
+
+            if keyword_pos != -1:
+                # Get the keyword text (already lowercase)
+                actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
+
+                # Calculate rough window size using dynamic ratio
+                # Cap the rough chunk target token size to prevent tokenizer warnings
+                # Use 512 tokens as target (model's max limit)
+                ROUGH_CHUNK_TARGET_TOKENS = 512
+                char_window = int(ROUGH_CHUNK_TARGET_TOKENS * chars_per_token / 2)
+
+                # Get rough chunk boundaries in characters
+                rough_start = max(0, keyword_pos - char_window)
+                rough_end = min(len(text), keyword_pos + len(keyword) + char_window)
+
+                # Extract rough chunk for processing
+                rough_chunk = text[rough_start:rough_end]
+
+                # Find keyword's relative position in rough chunk
+                rel_pos = rough_chunk.find(actual_keyword)
+                if rel_pos == -1:
+                    logger.debug(f"Could not locate keyword '{actual_keyword}' in rough chunk for doc {doc_id}")
+                    continue
+
+                # Calculate token position by tokenizing text before keyword
+                text_before = rough_chunk[:rel_pos]
+                tokens_before = self.tokenizer.tokenize(text_before)
+                keyword_start_pos = len(tokens_before)
+
+                # Tokenize necessary parts
+                chunk_tokens = self.tokenizer.tokenize(rough_chunk)
+                keyword_tokens = self.tokenizer.tokenize(actual_keyword)
+                keyword_length = len(keyword_tokens)
+
+                # Calculate final chunk boundaries in tokens
+                tokens_each_side = (chunk_size - keyword_length) // 2
+                chunk_start = max(0, keyword_start_pos - tokens_each_side)
+                chunk_end = min(len(chunk_tokens), keyword_start_pos + keyword_length + tokens_each_side)
+
+                # Add overlap if possible
+                if chunk_start > 0:
+                    chunk_start = max(0, chunk_start - self.chunk_overlap)
+                if chunk_end < len(chunk_tokens):
+                    chunk_end = min(len(chunk_tokens), chunk_end + self.chunk_overlap)
+
+                # Extract final tokens and convert to text
+                final_tokens = chunk_tokens[chunk_start:chunk_end]
+                chunk_text = self.tokenizer.convert_tokens_to_string(final_tokens)
+
+                # Verify keyword presence in final chunk
+                if chunk_text and actual_keyword in chunk_text:
+                    chunk_info = {
+                        "text": chunk_text,
+                        "primary_keyword": actual_keyword,
+                        "all_matched_keywords": matched_keywords.lower(),
+                        "token_count": len(final_tokens),
+                        "chunk_id": f"{doc_id}_chunk_{i}" if doc_id else f"chunk_{i}",
+                        "source_doc_id": doc_id
+                    }
+                    chunks.append(chunk_info)
+                else:
+                    logger.debug(f"Could not create chunk for keyword '{actual_keyword}' in doc {doc_id}")
+
+        if chunks:
+            logger.debug(f"Created {len(chunks)} chunks for document {doc_id or 'unknown'}")
+
+        return chunks
+
+    def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
+                                   treatment_keywords: str, chunk_size: int = 512,
+                                   doc_id: str = None) -> List[Dict[str, Any]]:
+        """
+        Create chunks for treatment data with both emergency and treatment keywords
+
+        Args:
+            text: Input text
+            emergency_keywords: Emergency keywords
+            treatment_keywords: Treatment keywords
+            chunk_size: Size of each chunk
+            doc_id: Document ID for tracking
+
+        Returns:
+            List of chunk dictionaries
+        """
+        if not treatment_keywords or pd.isna(treatment_keywords):
+            return []
+
+        chunks = []
+        em_keywords = emergency_keywords.split("|") if emergency_keywords else []
+        tr_keywords = treatment_keywords.split("|") if treatment_keywords else []
+
+        # Process treatment keywords as primary (since this is treatment-focused data)
+        for i, tr_keyword in enumerate(tr_keywords):
+            tr_pos = text.lower().find(tr_keyword.lower())
+
+            if tr_pos != -1:
+                # Find closest emergency keyword for context
+                closest_em_keyword = None
+                closest_distance = float('inf')
+
+                for em_keyword in em_keywords:
+                    em_pos = text.lower().find(em_keyword.lower())
+                    if em_pos != -1:
+                        distance = abs(tr_pos - em_pos)
+                        if distance < closest_distance and distance < chunk_size:
+                            closest_distance = distance
+                            closest_em_keyword = em_keyword
+
+                # Calculate chunk boundaries
+                if closest_em_keyword:
+                    # Center between both keywords
+                    em_pos = text.lower().find(closest_em_keyword.lower())
+                    center = (tr_pos + em_pos) // 2
+                else:
+                    # Center on treatment keyword
+                    center = tr_pos
+
+                start = max(0, center - chunk_size // 2)
+                end = min(len(text), center + chunk_size // 2)
+
+                chunk_text = text[start:end].strip()
+
+                if chunk_text:
+                    chunk_info = {
+                        "text": chunk_text,
+                        "primary_keyword": tr_keyword,
+                        "emergency_keywords": emergency_keywords,
+                        "treatment_keywords": treatment_keywords,
+                        "closest_emergency_keyword": closest_em_keyword,
+                        "keyword_distance": closest_distance if closest_em_keyword else None,
+                        "chunk_start": start,
+                        "chunk_end": end,
+                        "chunk_id": f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}",
+                        "source_doc_id": doc_id
+                    }
+                    chunks.append(chunk_info)
+
+        return chunks
+
+    def process_emergency_chunks(self) -> List[Dict[str, Any]]:
+        """Process emergency data into chunks"""
+        if self.emergency_data is None:
+            raise ValueError("Emergency data not loaded. Call load_filtered_data() first.")
+
+        all_chunks = []
+
+        # Add progress bar with leave=False to avoid cluttering
+        for idx, row in tqdm(self.emergency_data.iterrows(),
+                             total=len(self.emergency_data),
+                             desc="Processing emergency documents",
+                             unit="doc",
+                             leave=False):
+            if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
+                chunks = self.create_keyword_centered_chunks(
+                    text=row['clean_text'],
+                    matched_keywords=row['matched'],
+                    chunk_size=self.chunk_size,
+                    doc_id=str(row.get('id', idx))
+                )
+
+                # Add metadata to each chunk
+                for chunk in chunks:
+                    chunk.update({
+                        'source_type': 'emergency',
+                        'source_title': row.get('title', ''),
+                        'source_url': row.get('url', ''),
+                        'has_emergency': row.get('has_emergency', True),
+                        'doc_type': row.get('type', 'emergency')
+                    })
+
+                all_chunks.extend(chunks)
+
+        self.emergency_chunks = all_chunks
+        logger.info(f"Completed processing emergency data: {len(all_chunks)} chunks generated")
+        return all_chunks
+
+    def process_treatment_chunks(self) -> List[Dict[str, Any]]:
+        """Process treatment data into chunks"""
+        if self.treatment_data is None:
+            raise ValueError("Treatment data not loaded. Call load_filtered_data() first.")
+
+        all_chunks = []
+
+        # Add progress bar with leave=False to avoid cluttering
+        for idx, row in tqdm(self.treatment_data.iterrows(),
+                             total=len(self.treatment_data),
+                             desc="Processing treatment documents",
+                             unit="doc",
+                             leave=False):
+            if (pd.notna(row.get('clean_text')) and
+                pd.notna(row.get('treatment_matched'))):
+
+                chunks = self.create_dual_keyword_chunks(
+                    text=row['clean_text'],
+                    emergency_keywords=row.get('matched', ''),
+                    treatment_keywords=row['treatment_matched'],
+                    chunk_size=self.chunk_size,
+                    doc_id=str(row.get('id', idx))
+                )
+
+                # Add metadata to each chunk
+                for chunk in chunks:
+                    chunk.update({
+                        'source_type': 'treatment',
+                        'source_title': row.get('title', ''),
+                        'source_url': row.get('url', ''),
+                        'has_emergency': row.get('has_emergency', True),
+                        'has_treatment': row.get('has_treatment', True),
+                        'doc_type': row.get('type', 'treatment')
+                    })
+
+                all_chunks.extend(chunks)
+
+        self.treatment_chunks = all_chunks
+        logger.info(f"Completed processing treatment data: {len(all_chunks)} chunks generated")
+        return all_chunks
+
+    def _get_chunk_hash(self, text: str) -> str:
+        """Generate hash for chunk text to use as cache key"""
+        import hashlib
+        return hashlib.md5(text.encode('utf-8')).hexdigest()
+
+    def _load_embedding_cache(self, cache_file: str) -> dict:
+        """Load embedding cache from file"""
+        import pickle
+        import os
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'rb') as f:
+                    return pickle.load(f)
+            except:
+                logger.warning(f"Could not load cache file {cache_file}, starting fresh")
+                return {}
+        return {}
+
+    def _save_embedding_cache(self, cache: dict, cache_file: str):
+        """Save embedding cache to file"""
+        import pickle
+        import os
+        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
+        with open(cache_file, 'wb') as f:
+            pickle.dump(cache, f)
+
+    def generate_embeddings(self, chunks: List[Dict[str, Any]],
+                            chunk_type: str = "emergency") -> np.ndarray:
+        """
+        Generate embeddings for chunks with caching support
+
+        Args:
+            chunks: List of chunk dictionaries
+            chunk_type: Type of chunks ("emergency" or "treatment")
+
+        Returns:
+            numpy array of embeddings
+        """
+        logger.info(f"Starting embedding generation for {len(chunks)} {chunk_type} chunks...")
+
+        # Cache setup
+        cache_dir = self.models_dir / "cache"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_file = cache_dir / f"{chunk_type}_embeddings_cache.pkl"
+
+        # Load existing cache
+        cache = self._load_embedding_cache(str(cache_file))
+
+        cached_embeddings = []
+        to_embed = []
+
+        # Check cache for each chunk
+        for i, chunk in enumerate(chunks):
+            chunk_hash = self._get_chunk_hash(chunk['text'])
+            if chunk_hash in cache:
+                cached_embeddings.append((i, cache[chunk_hash]))
+            else:
+                to_embed.append((i, chunk_hash, chunk['text']))
+
+        logger.info(f"Cache status: {len(cached_embeddings)} cached, {len(to_embed)} new chunks to embed")
+
+        # Generate embeddings for new chunks
+        new_embeddings = []
+        if to_embed:
+            # Load model
+            model = self.load_embedding_model()
+            texts = [text for _, _, text in to_embed]
+
+            # Generate embeddings in batches with clear progress
+            batch_size = 32
+            total_batches = (len(texts) + batch_size - 1) // batch_size
+
+            logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
+
+            for i in tqdm(range(0, len(texts), batch_size),
+                          desc=f"Embedding {chunk_type} subset",
+                          total=total_batches,
+                          unit="batch",
+                          leave=False):
+                batch_texts = texts[i:i + batch_size]
+                batch_emb = model.encode(
+                    batch_texts,
+                    show_progress_bar=False
+                )
+                new_embeddings.extend(batch_emb)
+
+            # Update cache with new embeddings
+            for (_, chunk_hash, _), emb in zip(to_embed, new_embeddings):
+                cache[chunk_hash] = emb
+
+            # Save updated cache
+            self._save_embedding_cache(cache, str(cache_file))
+            logger.info(f"Updated cache with {len(new_embeddings)} new embeddings")
+
+        # Combine cached and new embeddings in correct order
+        all_embeddings = [None] * len(chunks)
+
+        # Place cached embeddings
+        for idx, emb in cached_embeddings:
+            all_embeddings[idx] = emb
+
+        # Place new embeddings
+        for (idx, _, _), emb in zip(to_embed, new_embeddings):
+            all_embeddings[idx] = emb
+
+        # Convert to numpy array
+        result = np.vstack(all_embeddings)
+        logger.info(f"Completed embedding generation: shape {result.shape}")
+
+        return result
+
+    def build_annoy_index(self, embeddings: np.ndarray,
+                          index_name: str, n_trees: int = 15) -> AnnoyIndex:
+        """
+        Build ANNOY index from embeddings
+
+        Args:
+            embeddings: Numpy array of embeddings
+            index_name: Name for the index file
+            n_trees: Number of trees for ANNOY index
+
+        Returns:
+            Built ANNOY index
+        """
+        logger.info(f"Building ANNOY index: {index_name}")
+
+        # Create ANNOY index
+        index = AnnoyIndex(self.embedding_dim, 'angular')  # angular = cosine similarity
+
+        # Add vectors to index
+        for i, embedding in enumerate(embeddings):
+            index.add_item(i, embedding)
+
+        # Build index
+        index.build(n_trees)
+
+        # Save index
+        index_path = self.models_dir / "indices" / "annoy" / f"{index_name}.ann"
+        index_path.parent.mkdir(parents=True, exist_ok=True)
+        index.save(str(index_path))
+
+        logger.info(f"ANNOY index saved to: {index_path}")
+        return index
+
+    def save_chunks_and_embeddings(self, chunks: List[Dict[str, Any]],
+                                   embeddings: np.ndarray, chunk_type: str):
+        """
+        Save chunks metadata and embeddings
+
+        Args:
+            chunks: List of chunk dictionaries
+            embeddings: Numpy array of embeddings
+            chunk_type: Type of chunks ("emergency" or "treatment")
+        """
+        logger.info(f"Saving {chunk_type} chunks and embeddings...")
+
+        # Create output directories
+        embeddings_dir = self.models_dir / "embeddings"
+        embeddings_dir.mkdir(parents=True, exist_ok=True)
+
+        # Save chunks metadata
+        chunks_file = embeddings_dir / f"{chunk_type}_chunks.json"
+        with open(chunks_file, 'w', encoding='utf-8') as f:
+            json.dump(chunks, f, ensure_ascii=False, indent=2)
+
+        # Save embeddings
+        embeddings_file = embeddings_dir / f"{chunk_type}_embeddings.npy"
+        np.save(embeddings_file, embeddings)
+
+        logger.info(f"Saved {chunk_type} data:")
+        logger.info(f"  - Chunks: {chunks_file}")
+        logger.info(f"  - Embeddings: {embeddings_file}")
+
+    def validate_data_quality(self) -> Dict[str, Any]:
+        """
+        Validate data quality and return statistics
+
+        Returns:
+            Dictionary with validation statistics
+        """
+        logger.info("Validating data quality...")
+
+        validation_report = {
+            "emergency_data": {},
+            "treatment_data": {},
+            "chunks": {},
+            "embeddings": {}
+        }
+
+        # Emergency data validation
+        if self.emergency_data is not None:
+            validation_report["emergency_data"] = {
+                "total_records": len(self.emergency_data),
+                "records_with_text": self.emergency_data['clean_text'].notna().sum(),
+                "records_with_keywords": self.emergency_data['matched'].notna().sum(),
+                "avg_text_length": self.emergency_data['clean_text'].str.len().mean()
+            }
+
+        # Treatment data validation
+        if self.treatment_data is not None:
+            validation_report["treatment_data"] = {
+                "total_records": len(self.treatment_data),
+                "records_with_text": self.treatment_data['clean_text'].notna().sum(),
+                "records_with_emergency_keywords": self.treatment_data['matched'].notna().sum(),
+                "records_with_treatment_keywords": self.treatment_data['treatment_matched'].notna().sum(),
+                "avg_text_length": self.treatment_data['clean_text'].str.len().mean()
+            }
+
+        # Chunks validation
+        validation_report["chunks"] = {
+            "emergency_chunks": len(self.emergency_chunks),
+            "treatment_chunks": len(self.treatment_chunks),
+            "total_chunks": len(self.emergency_chunks) + len(self.treatment_chunks)
+        }
+
+        if self.emergency_chunks:
+            avg_chunk_length = np.mean([len(chunk['text']) for chunk in self.emergency_chunks])
+            validation_report["chunks"]["avg_emergency_chunk_length"] = avg_chunk_length
+
+        if self.treatment_chunks:
+            avg_chunk_length = np.mean([len(chunk['text']) for chunk in self.treatment_chunks])
+            validation_report["chunks"]["avg_treatment_chunk_length"] = avg_chunk_length
+
+        # Check if embeddings exist
+        embeddings_dir = self.models_dir / "embeddings"
+        if embeddings_dir.exists():
+            emergency_emb_file = embeddings_dir / "emergency_embeddings.npy"
+            treatment_emb_file = embeddings_dir / "treatment_embeddings.npy"
+
+            validation_report["embeddings"] = {
+                "emergency_embeddings_exist": emergency_emb_file.exists(),
+                "treatment_embeddings_exist": treatment_emb_file.exists()
+            }
+
+            if emergency_emb_file.exists():
+                emb = np.load(emergency_emb_file)
+                validation_report["embeddings"]["emergency_embeddings_shape"] = emb.shape
+
+            if treatment_emb_file.exists():
+                emb = np.load(treatment_emb_file)
+                validation_report["embeddings"]["treatment_embeddings_shape"] = emb.shape
+
+        # Save validation report
+        report_file = self.models_dir / "data_validation_report.json"
+        with open(report_file, 'w', encoding='utf-8') as f:
+            json.dump(validation_report, f, indent=2, default=str)
+
+        logger.info(f"Validation report saved to: {report_file}")
+        return validation_report
+
+    def process_all_data(self) -> Dict[str, Any]:
+        """
+        Complete data processing pipeline
+
+        Returns:
+            Processing summary
+        """
+        logger.info("Starting complete data processing pipeline...")
+
+        # Step 1: Load filtered data
+        self.load_filtered_data()
+
+        # Step 2: Process chunks
+        emergency_chunks = self.process_emergency_chunks()
+        treatment_chunks = self.process_treatment_chunks()
+
+        # Step 3: Generate embeddings
+        emergency_embeddings = self.generate_embeddings(emergency_chunks, "emergency")
+        treatment_embeddings = self.generate_embeddings(treatment_chunks, "treatment")
+
+        # Step 4: Build ANNOY indices
+        self.emergency_index = self.build_annoy_index(emergency_embeddings, "emergency_index")
+        self.treatment_index = self.build_annoy_index(treatment_embeddings, "treatment_index")
+
+        # Step 5: Save data
+        self.save_chunks_and_embeddings(emergency_chunks, emergency_embeddings, "emergency")
+        self.save_chunks_and_embeddings(treatment_chunks, treatment_embeddings, "treatment")
+
+        # Step 6: Validate data quality
+        validation_report = self.validate_data_quality()
+
+        # Summary
+        summary = {
+            "status": "completed",
+            "emergency_chunks": len(emergency_chunks),
+            "treatment_chunks": len(treatment_chunks),
+            "emergency_embeddings_shape": emergency_embeddings.shape,
+            "treatment_embeddings_shape": treatment_embeddings.shape,
+            "indices_created": ["emergency_index.ann", "treatment_index.ann"],
+            "validation_report": validation_report
+        }
+
+        logger.info("Data processing pipeline completed successfully!")
+        logger.info(f"Summary: {summary}")
+
+        return summary
+
+def main():
+    """Main function for testing the data processor"""
+    # Initialize processor
+    processor = DataProcessor()
+
+    # Run complete pipeline
+    summary = processor.process_all_data()
+
+    print("\n" + "="*50)
+    print("DATA PROCESSING COMPLETED")
+    print("="*50)
+    print(f"Emergency chunks: {summary['emergency_chunks']}")
+    print(f"Treatment chunks: {summary['treatment_chunks']}")
+    print(f"Emergency embeddings: {summary['emergency_embeddings_shape']}")
+    print(f"Treatment embeddings: {summary['treatment_embeddings_shape']}")
+    print(f"Indices created: {summary['indices_created']}")
+    print("="*50)
+
+if __name__ == "__main__":
+    main()
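The module above only builds and saves the indices; querying them happens elsewhere in the system. A minimal retrieval sketch against the artifacts this pipeline writes might look as follows; the paths mirror build_annoy_index and save_chunks_and_embeddings, while the query text and top_k of 5 are purely illustrative.

```python
import json
from pathlib import Path

from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

models_dir = Path("models")

# Must match the embedding_dim and 'angular' metric used when the index was built
index = AnnoyIndex(768, "angular")
index.load(str(models_dir / "indices" / "annoy" / "emergency_index.ann"))

with open(models_dir / "embeddings" / "emergency_chunks.json", encoding="utf-8") as f:
    chunks = json.load(f)  # chunk i in this list corresponds to item i in the index

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
query_vec = model.encode("emergency management of anaphylactic shock")

ids, dists = index.get_nns_by_vector(query_vec, 5, include_distances=True)
for i, d in zip(ids, dists):
    print(f"{d:.3f}  {chunks[i]['primary_keyword']}: {chunks[i]['text'][:80]}...")
```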
tests/embedding_test_analysis.md
ADDED
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Embedding Test Analysis Report
|
2 |
+
|
3 |
+
## 1. Dataset Overview
|
4 |
+
|
5 |
+
### 1.1 Data Dimensions
|
6 |
+
- Emergency Dataset: 27,493 chunks × 768 dimensions
|
7 |
+
- Treatment Dataset: 82,378 chunks × 768 dimensions
|
8 |
+
- Total Chunks: 109,871
|
9 |
+
|
10 |
+
### 1.2 Embedding Statistics
|
11 |
+
|
12 |
+
**Emergency Embeddings:**
|
13 |
+
- Value Range: -3.246 to 3.480
|
14 |
+
- Mean: -0.017
|
15 |
+
- Standard Deviation: 0.462
|
16 |
+
|
17 |
+
**Treatment Embeddings:**
|
18 |
+
- Value Range: -3.686 to 3.505
|
19 |
+
- Mean: -0.017
|
20 |
+
- Standard Deviation: 0.472
|
21 |
+
|
22 |
+
**Analysis:**
|
23 |
+
- Both datasets have similar statistical properties
|
24 |
+
- Mean values are centered around zero (-0.017)
|
25 |
+
- Standard deviations are comparable (0.462 vs 0.472)
|
26 |
+
- Treatment dataset has slightly wider range (-3.686 to 3.505 vs -3.246 to 3.480)
|
27 |
+
|
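To make these figures reproducible, a minimal sketch follows; it assumes the `models/embeddings/*.npy` layout used by the tests in this PR and the project root as the working directory.

```python
# Minimal sketch: recompute the summary statistics above from the saved
# embedding matrices (assumes the models/embeddings layout used in this PR
# and the project root as the working directory).
from pathlib import Path
import numpy as np

emb_dir = Path("models/embeddings")

for name in ("emergency", "treatment"):
    emb = np.load(emb_dir / f"{name}_embeddings.npy")
    print(f"{name}: shape={emb.shape}, "
          f"range={emb.min():.3f} to {emb.max():.3f}, "
          f"mean={emb.mean():.3f}, std={emb.std():.3f}")
```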
28 |
+
## 2. Model Performance
|
29 |
+
|
30 |
+
### 2.1 Self-Retrieval Test
|
31 |
+
- Test Size: 20 random samples
|
32 |
+
- Success Rate: 19/20 (95%)
|
33 |
+
- Failed Case: Index 27418
|
34 |
+
- Average Response Time: ~5ms per search
|
35 |
+
|
36 |
+
**Observations:**
|
37 |
+
- High success rate in self-retrieval (95%)
|
38 |
+
- One failure case needs investigation
|
39 |
+
- Search operations are consistently fast
|
40 |
+
|
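A minimal sketch of this check is shown below; it assumes the index and embedding files produced by this PR and mirrors the logic of `test_multiple_known_item_search` in `tests/test_embedding_validation.py` further down.

```python
# Minimal sketch of the self-retrieval check: each sampled vector should come
# back as its own top-1 neighbour with near-zero angular distance.
import numpy as np
from annoy import AnnoyIndex

emb = np.load("models/embeddings/emergency_embeddings.npy")
index = AnnoyIndex(768, "angular")
index.load("models/indices/annoy/emergency_index.ann")

passed = 0
for i in np.random.choice(emb.shape[0], size=20, replace=False):
    ids, dists = index.get_nns_by_vector(emb[i], 5, include_distances=True)
    if ids[0] == i and dists[0] < 1e-4:
        passed += 1
print(f"{passed}/20 self-retrieval checks passed")
```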
41 |
+
### 2.2 Cross-Dataset Search Performance
|
42 |
+
|
43 |
+
**Test Queries:**
|
44 |
+
1. "What is the treatment protocol for acute myocardial infarction?"
|
45 |
+
2. "How to manage severe chest pain with difficulty breathing?"
|
46 |
+
3. "What are the emergency procedures for anaphylactic shock?"
|
47 |
+
|
48 |
+
**Key Findings:**
|
49 |
+
- Each query returns top-5 results from both datasets
|
50 |
+
- Results show semantic understanding (not just keyword matching)
|
51 |
+
- First sentences provide good context for relevance assessment
|
52 |
+
|
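The snippet below is a condensed sketch of how such a query is run against both indices; the full version, with logging and error handling, is `tests/test_embedding_validation.py` later in this PR.

```python
# Condensed sketch: encode one query and pull top-5 neighbours from each index,
# printing the first ~80 characters of every hit (paths follow this PR's layout).
import json
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
query_vec = model.encode(["What are the emergency procedures for anaphylactic shock?"])[0]

for name in ("emergency", "treatment"):
    index = AnnoyIndex(768, "angular")
    index.load(f"models/indices/annoy/{name}_index.ann")
    with open(f"models/embeddings/{name}_chunks.json") as f:
        chunks = json.load(f)
    ids, dists = index.get_nns_by_vector(query_vec, 5, include_distances=True)
    for rank, (idx, dist) in enumerate(zip(ids, dists), 1):
        print(f"{name} #{rank} (distance {dist:.3f}): {chunks[idx]['text'][:80]}...")
```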
53 |
+
## 3. System Performance
|
54 |
+
|
55 |
+
### 3.1 Response Times
|
56 |
+
- Model Loading: ~3 seconds
|
57 |
+
- Embedding Validation: ~0.5 seconds
|
58 |
+
- Search Operations: 0.1-0.2 seconds per query
|
59 |
+
|
60 |
+
### 3.2 Resource Usage
|
61 |
+
- Model loaded on MPS (Metal Performance Shaders)
|
62 |
+
- Efficient memory usage for large datasets
|
63 |
+
- Fast vector operations
|
64 |
+
|
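Device selection is handled by `sentence-transformers`; the sketch below shows one way to request MPS explicitly and fall back to CPU. The fallback logic is an illustration, not part of this PR's code.

```python
# Sketch: request the MPS backend explicitly and fall back to CPU when it is
# unavailable (the fallback is an illustration, not part of this PR's code).
import torch
from sentence_transformers import SentenceTransformer

device = "mps" if torch.backends.mps.is_available() else "cpu"
model = SentenceTransformer("NeuML/pubmedbert-base-embeddings", device=device)
print(f"Embedding model loaded on: {device}")
```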
65 |
+
## 4. Recommendations
|
66 |
+
|
67 |
+
### 4.1 Immediate Improvements
|
68 |
+
1. Investigate the failed self-retrieval case (index 27418); see the inspection sketch after this list
|
69 |
+
2. Consider caching frequently accessed embeddings
|
70 |
+
3. Add more diverse test queries
|
71 |
+
|
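As a starting point for item 1, the sketch below pulls the chunk behind index 27418 out of `emergency_chunks.json` and checks whether it is unusually short or duplicated; the duplicate check is a hypothesis to test, not a confirmed cause.

```python
# Sketch: inspect the chunk behind the failed index and look for exact duplicates
# (a duplicate chunk would also sit at distance ~0 and can displace the query
# chunk from rank 1 in the self-retrieval test).
import json

with open("models/embeddings/emergency_chunks.json") as f:
    chunks = json.load(f)

failed_idx = 27418
chunk = chunks[failed_idx]
print("Primary keyword:", chunk["primary_keyword"])
print("Text length:", len(chunk["text"]))
print(chunk["text"][:200])

duplicates = [i for i, c in enumerate(chunks)
              if c["text"] == chunk["text"] and i != failed_idx]
print("Identical chunks at indices:", duplicates)
```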
72 |
+
### 4.2 Future Enhancements
|
73 |
+
1. Implement hybrid search (combine BM25 with embedding similarity); see the scoring sketch after this list
|
74 |
+
2. Add relevance scoring mechanism
|
75 |
+
3. Consider domain-specific test cases
|
76 |
+
|
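For item 1, a hybrid score could blend lexical and semantic signals roughly as sketched below; the `rank_bm25` package and the equal weighting are assumptions for illustration, not part of this PR.

```python
# Sketch of a hybrid score (assumptions: the rank_bm25 package and an equal
# weighting between lexical and semantic scores; neither is part of this PR).
import numpy as np
from rank_bm25 import BM25Okapi

def hybrid_scores(query, query_vec, chunk_texts, chunk_vecs, alpha=0.5):
    """Blend BM25 lexical scores with embedding cosine similarity."""
    bm25 = BM25Okapi([t.lower().split() for t in chunk_texts])
    lexical = bm25.get_scores(query.lower().split())
    lexical = lexical / (lexical.max() or 1.0)  # normalise to [0, 1]
    semantic = chunk_vecs @ query_vec / (
        np.linalg.norm(chunk_vecs, axis=1) * np.linalg.norm(query_vec)
    )
    return alpha * lexical + (1 - alpha) * semantic
```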
77 |
+
## 5. Log Analysis
|
78 |
+
|
79 |
+
### 5.1 Log Structure
|
80 |
+
```
|
81 |
+
timestamp - level - message
|
82 |
+
```
|
83 |
+
|
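This format corresponds to the standard `logging` configuration used by the validation script in this PR, roughly:

```python
# The "timestamp - level - message" lines come from a standard logging setup,
# the same configuration used in tests/test_embedding_validation.py.
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="embedding_validation.log"
)
logging.getLogger(__name__).info("Search completed in ~5ms")
```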
84 |
+
### 5.2 Log Levels Used
|
85 |
+
- DEBUG: Detailed operation info
|
86 |
+
- INFO: General progress and results
|
87 |
+
- WARNING: Non-critical issues
|
88 |
+
- ERROR: Critical failures
|
89 |
+
|
90 |
+
### 5.3 Key Log Categories
|
91 |
+
1. **Initialization Logs:**
|
92 |
+
- Path configurations
|
93 |
+
- Model loading
|
94 |
+
- Dataset loading
|
95 |
+
|
96 |
+
2. **Performance Logs:**
|
97 |
+
- Search operations
|
98 |
+
- Response times
|
99 |
+
- Success/failure counts
|
100 |
+
|
101 |
+
3. **Error Logs:**
|
102 |
+
- Failed searches
|
103 |
+
- Validation errors
|
104 |
+
- Connection issues
|
105 |
+
|
106 |
+
### 5.4 Notable Log Patterns
|
107 |
+
- Regular HTTPS connections to HuggingFace
|
108 |
+
- Consistent search operation timing
|
109 |
+
- Clear error messages for failures
|
110 |
+
|
111 |
+
|
112 |
+
<!-- split -->
|
113 |
+
|
114 |
+
|
115 |
+
# 🧪 Embedding Test Analysis Report
|
116 |
+
|
117 |
+
## 1. Dataset Overview
|
118 |
+
|
119 |
+
### 1.1 Data Dimensions
|
120 |
+
- **Emergency Dataset**: 27,493 chunks × 768 dimensions
|
121 |
+
- **Treatment Dataset**: 82,378 chunks × 768 dimensions
|
122 |
+
- **Total Chunks**: 109,871
|
123 |
+
|
124 |
+
### 1.2 Embedding Statistics
|
125 |
+
**Emergency Embeddings:**
|
126 |
+
- Value Range: -3.246 ~ 3.480
|
127 |
+
- Mean: -0.017
|
128 |
+
- Std: 0.462
|
129 |
+
|
130 |
+
**Treatment Embeddings:**
|
131 |
+
- Value Range: -3.686 ~ 3.505
|
132 |
+
- Mean: -0.017
|
133 |
+
- Std: 0.472
|
134 |
+
|
135 |
+
**Analysis:**
|
136 |
+
- The two datasets have very similar vector distributions, with both means close to 0
|
137 |
+
- The Treatment dataset has a slightly wider value range, likely reflecting broader semantic coverage
|
138 |
+
|
139 |
+
---
|
140 |
+
|
141 |
+
## 2. Model Performance
|
142 |
+
|
143 |
+
### 2.1 Self-Retrieval Test
|
144 |
+
- Test Size: 20
|
145 |
+
- Success Rate: **95% (19/20)**
|
146 |
+
- Failed Index: `27418`
|
147 |
+
- Avg Search Time: ~5ms
|
148 |
+
|
149 |
+
**Observation:**
|
150 |
+
- The high self-retrieval success rate indicates the index was built accurately
|
151 |
+
- The failed sample should be checked further to see whether its chunk was cut too short
|
152 |
+
|
153 |
+
|
154 |
+
<!-- Details -->
|
155 |
+
|
156 |
+
# 🔍 Embedding Search Analysis Report (Emergency vs Treatment)
|
157 |
+
|
158 |
+
## 📊 Overall Summary
|
159 |
+
|
160 |
+
| Query | Emergency Results | Treatment Results | Summary Comment |
|
161 |
+
|---------------------------------------------------------|------------------------|------------------------|-----------------------------------------------|
|
162 |
+
| 1️⃣ Treatment for Acute Myocardial Infarction | ✅ Matched well | ✅ Highly relevant | Relevant guidelines retrieved from both sets |
|
163 |
+
| 2️⃣ Management of Severe Chest Pain with Dyspnea | ⚠️ Redundant, not focused | ⚠️ Vague and general | Lacks actionable steps, contains repetition |
|
164 |
+
| 3️⃣ Emergency Procedures for Anaphylactic Shock | ⚠️ Off-topic | ✅ Precise and relevant | Emergency off-topic, but Treatment is strong |
|
165 |
+
|
166 |
+
---
|
167 |
+
|
168 |
+
## 🔍 Detailed Query Analysis
|
169 |
+
|
170 |
+
### ✅ 1. `What is the treatment protocol for acute myocardial infarction?`
|
171 |
+
|
172 |
+
#### 📌 Emergency Dataset:
|
173 |
+
- `E-2 ~ E-4` mention guidelines, STEMI, PCI.
|
174 |
+
- Distances range from `0.833 ~ 0.842` → valid.
|
175 |
+
- `E-3` is a long guideline chunk → ideal RAG candidate.
|
176 |
+
|
177 |
+
✅ Conclusion: The Emergency subset performs well; keyword-centered chunking is effective.
|
178 |
+
|
179 |
+
#### 📌 Treatment Dataset:
|
180 |
+
- `T-1` and `T-2` directly address the question with guideline phrases.
|
181 |
+
- `distance ~0.813` → strong semantic match.
|
182 |
+
- `T-5` is shorter but still contains “AMI”.
|
183 |
+
|
184 |
+
✅ Conclusion: Treatment retrieval is highly effective.
|
185 |
+
|
186 |
+
---
|
187 |
+
|
188 |
+
### ⚠️ 2. `How to manage severe chest pain with difficulty breathing?`
|
189 |
+
|
190 |
+
#### 📌 Emergency Dataset:
|
191 |
+
- `E-1 ~ E-3` are identical dyspnea passages; no actionable steps.
|
192 |
+
- `E-4 ~ E-5` are general symptom overviews, not acute response protocols.
|
193 |
+
|
194 |
+
⚠️ Issue: Semantic match exists, but lacks procedural content.
|
195 |
+
⚠️ Repetition indicates Annoy might be over-focused on a narrow cluster.
|
196 |
+
|
197 |
+
#### 📌 Treatment Dataset:
|
198 |
+
- `T-1 ~ T-3` mention dyspnea and chest pain but are mostly patient descriptions.
|
199 |
+
- `T-4` hints at emergency care for asthma but still lacks clarity.
|
200 |
+
|
201 |
+
⚠️ Conclusion: This query needs better symptom-action co-occurrence modeling.
|
202 |
+
|
203 |
+
---
|
204 |
+
|
205 |
+
### ⚠️ 3. `What are the emergency procedures for anaphylactic shock?`
|
206 |
+
|
207 |
+
#### 📌 Emergency Dataset:
|
208 |
+
- `E-1 ~ E-2`: irrelevant or truncated.
|
209 |
+
- `E-3`: mentions management during anesthesia → partial match.
|
210 |
+
- `E-4 ~ E-5`: just list multiple shock types; no protocol info.
|
211 |
+
|
212 |
+
❌ Emergency dataset lacks focused content on this topic.
|
213 |
+
|
214 |
+
#### 📌 Treatment Dataset:
|
215 |
+
- `T-1`: explicitly lists epinephrine, oxygen, IV fluids, corticosteroids → ✅ ideal
|
216 |
+
- `T-2`: confirms emergency drug prep
|
217 |
+
- `T-3 ~ T-5`: all recognize anaphylactic shock
|
218 |
+
|
219 |
+
✅ Conclusion: Treatment subset captures this case very accurately.
|
220 |
+
|
221 |
+
---
|
222 |
+
|
223 |
+
## 📏 Distance Threshold Reference
|
224 |
+
|
225 |
+
| Distance Value Range | Interpretation |
|
226 |
+
|----------------------|--------------------------------------------|
|
227 |
+
| `< 0.80` | Very strong match (almost identical) |
|
228 |
+
| `0.80 ~ 0.86` | Acceptable semantic match |
|
229 |
+
| `> 0.90` | Weak relevance, possibly off-topic chunks |
|
230 |
+
|
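For reference, Annoy's angular distance is defined as `sqrt(2 * (1 - cosine similarity))` for normalised vectors, so the bands above can be translated into cosine terms; the helper below is a sketch based on that definition.

```python
# Sketch: convert Annoy's angular distance (sqrt(2 * (1 - cos))) back to cosine
# similarity and drop hits outside the "acceptable" band from the table above.
def angular_to_cosine(distance: float) -> float:
    return 1.0 - (distance ** 2) / 2.0

def filter_hits(ids, distances, max_distance=0.86):
    """Keep only hits within the acceptable semantic match band."""
    return [(i, d, angular_to_cosine(d)) for i, d in zip(ids, distances) if d <= max_distance]

print(angular_to_cosine(0.80))  # ~0.68
print(angular_to_cosine(0.90))  # ~0.60
```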
231 |
+
---
|
232 |
+
|
233 |
+
## 🧰 Recommendations Based on Findings
|
234 |
+
|
235 |
+
| Issue Type | Suggested Solution |
|
236 |
+
|
237 |
+
|
238 |
+
```
(genAIvenv) yanbochen@YanBos-MacBook-Pro tests % python test_embedding_validation.py
|
239 |
+
|
240 |
+
|
241 |
+
=== Query: What is the treatment protocol for acute myocardial infarction? ===
|
242 |
+
Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6.65it/s]
|
243 |
+
|
244 |
+
Emergency Dataset Results:
|
245 |
+
|
246 |
+
E-1 (distance: 0.826):
|
247 |
+
myocardial infarction, white [ / bib _ ref ].
|
248 |
+
|
249 |
+
E-2 (distance: 0.833):
|
250 |
+
the management of acute myocardial infarction : guidelines and audit standards successful management of acute myocardial infarction depends in the first instance on the patient recognising the symptoms and seeking help as quickly as possible.
|
251 |
+
|
252 |
+
E-3 (distance: 0.836):
|
253 |
+
sandbox : stemi # 2017 esc guidelines for the management of acute myocardial infarction in patients presenting with st - segment elevation # # changes in recommendations # # what is new in 2017 guidelines on ami - stemi? # # ami - stemi - 2017 new recommendations # acc / aats / aha / ase / asnc / scai / scct / sts 2016 appropriate use criteria for coronary revascularization in patients with acute coronary syndromes # # stemi — immediate revascularization by pci # # stemi — initial treatment by fibrinolytic therapy # # stemi — revascularization of nonculprit artery during the initial hospitalization # 2017 aha / acc clinical performance and quality measures for adults with st - elevation and non – st - elevation myocardial infarction # # revised stemi and nstemi measures # # revised stemi and nstemi measures.
|
254 |
+
|
255 |
+
E-4 (distance: 0.842):
|
256 |
+
stemi resident survival guide # overview st elevation myocardial infarction ( stemi ) is a syndrome characterized by the presence of symptoms of myocardial ischemia associated with persistent st elevation on electrocardiogram and elevated cardiac enzymes.
|
257 |
+
|
258 |
+
E-5 (distance: 0.879):
|
259 |
+
# pre - discharge care abbreviations : ace : angiotensin converting enzyme ; lvef : left ventricular ejection fraction ; mi : myocardial infarction ; pci : percutaneous coronary intervention ; po : per os ; stemi : st elevation myocardial infarction ; vf : ventricular fibrillation ; vt : ventricular tachycardia # long term management abbreviations : ace : angiotensin converting enzyme ; arb : angiotensin receptor blocker ; mi : myocardial infarction # do ' s - a pre - hospital ecg is recommended.
|
260 |
+
|
261 |
+
Treatment Dataset Results:
|
262 |
+
|
263 |
+
T-1 (distance: 0.813):
|
264 |
+
intain the standard of care and timely access of patients with ACS, including acute myocardial infarction (AMI), to reperfusion therapy.
|
265 |
+
|
266 |
+
T-2 (distance: 0.825):
|
267 |
+
The Management of Acute Myocardial Infarction: Guidelines and Audit Standards
|
268 |
+
|
269 |
+
Successful management of acute myocardial infarction.
|
270 |
+
|
271 |
+
T-3 (distance: 0.854):
|
272 |
+
fined as STEMI, NSTEMI or unstable angina.
|
273 |
+
|
274 |
+
T-4 (distance: 0.869):
|
275 |
+
Japan, there are no clear guidelines focusing on procedural aspect of the standardized care.
|
276 |
+
|
277 |
+
T-5 (distance: 0.879):
|
278 |
+
ients with acute myocardial infarction (AMI).
|
279 |
+
|
280 |
+
|
281 |
+
=== Query: How to manage severe chest pain with difficulty breathing? ===
|
282 |
+
Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.76it/s]
|
283 |
+
|
284 |
+
Emergency Dataset Results:
|
285 |
+
|
286 |
+
E-1 (distance: 0.848):
|
287 |
+
shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
|
288 |
+
|
289 |
+
E-2 (distance: 0.849):
|
290 |
+
shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
|
291 |
+
|
292 |
+
E-3 (distance: 0.852):
|
293 |
+
shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
|
294 |
+
|
295 |
+
E-4 (distance: 0.879):
|
296 |
+
sandbox : milan # overview dyspnea is the uncomfortable awareness of one ' s own breathing.
|
297 |
+
|
298 |
+
E-5 (distance: 0.879):
|
299 |
+
sandbox : milan # overview dyspnea is the uncomfortable awareness of one ' s own breathing.
|
300 |
+
|
301 |
+
Treatment Dataset Results:
|
302 |
+
|
303 |
+
T-1 (distance: 0.827):
|
304 |
+
lly cyanotic and clammy, and may experience dyspnea or chest pain from underperfusion 13 .
|
305 |
+
|
306 |
+
T-2 (distance: 0.868):
|
307 |
+
acterized by a worsening of the patient’s respiratory symptoms (baseline dyspnea, cough, and/or sputum production) that is beyond normal day-to-day variations and leads to a change in medication.
|
308 |
+
|
309 |
+
T-3 (distance: 0.872):
|
310 |
+
ally cyanotic and clammy, and may experience dyspnea or chest pain from underperfusion 13.
|
311 |
+
|
312 |
+
T-4 (distance: 0.898):
|
313 |
+
ce used to test breathing) results show your breathing problems are worsening
|
314 |
+
- you need to go to the emergency room for asthma treatment.
|
315 |
+
|
316 |
+
T-5 (distance: 0.898):
|
317 |
+
breathlessness in a person in the last days of life.
|
318 |
+
|
319 |
+
|
320 |
+
=== Query: What are the emergency procedures for anaphylactic shock? ===
|
321 |
+
Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.16it/s]
|
322 |
+
|
323 |
+
Emergency Dataset Results:
|
324 |
+
|
325 |
+
E-1 (distance: 0.924):
|
326 |
+
the other.
|
327 |
+
|
328 |
+
E-2 (distance: 0.943):
|
329 |
+
ic defibrillation.
|
330 |
+
|
331 |
+
E-3 (distance: 0.946):
|
332 |
+
suspected anaphylactic reactions associated with anaesthesia # # summary ( 1 ) the aagbi has published guidance on management of anaphylaxis during anaesthesia in.
|
333 |
+
|
334 |
+
E-4 (distance: 0.952):
|
335 |
+
- gastrointestinal bleeding - perforated peptic ulcer - post - procedural or post - surgical - retroperitoneal hemorrhage - rupture ovarian cyst - trauma - distributive shock - sepsis - toxic shock syndrome - anaphylactic or anaphylactoid reaction - neurogenic shock - adrenal crisis # fire : focused initial rapid evaluation a focused initial rapid evaluation ( fire ) should be performed to identify patients in need of immediate intervention.
|
336 |
+
|
337 |
+
E-5 (distance: 0.954):
|
338 |
+
- surgical - retroperitoneal hemorrhage - rupture ovarian cyst - trauma - distributive shock - sepsis - toxic shock syndrome - anaphylactic or anaphylactoid reaction - neurogenic shock - adrenal crisis # fire : focused initial rapid evaluation a focused initial rapid evaluation ( fire ) should be performed to identify patients in need of immediate intervention.
|
339 |
+
|
340 |
+
Treatment Dataset Results:
|
341 |
+
|
342 |
+
T-1 (distance: 0.813):
|
343 |
+
ensitivity (anaphylactic) reactions require emergency treatment with epinephrine and other emergency measures, that may include airway management, oxygen, intravenous fluids, antihistamines, corticosteroids, and vasopressors as clinically indicated.
|
344 |
+
|
345 |
+
T-2 (distance: 0.833):
|
346 |
+
ave standard emergency treatments for hypersensitivity or anaphylactic reactions readily available in the operating room (e.
|
347 |
+
|
348 |
+
T-3 (distance: 0.838):
|
349 |
+
e, or systemic inflammation (anaphylactic shock).
|
350 |
+
|
351 |
+
T-4 (distance: 0.843):
|
352 |
+
ED AND APPROPRIATE THERAPY INSTITUTED.
|
353 |
+
|
354 |
+
T-5 (distance: 0.844):
|
355 |
+
UED AND APPROPRIATE THERAPY INSTITUTED.
```
|
tests/test_data_processing.py
ADDED
@@ -0,0 +1,228 @@
1 |
+
"""
|
2 |
+
Test script for data_processing.py
|
3 |
+
|
4 |
+
This script tests the basic functionality without running the full pipeline
|
5 |
+
to ensure everything is working correctly before proceeding with embedding generation.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import sys
|
9 |
+
from pathlib import Path
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
# Add src to path
|
13 |
+
sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
|
14 |
+
|
15 |
+
from data_processing import DataProcessor
|
16 |
+
import logging
|
17 |
+
|
18 |
+
# Setup logging
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.INFO,
|
21 |
+
format='%(levelname)s:%(name)s:%(message)s'
|
22 |
+
)
|
23 |
+
# Silence urllib3 logging
|
24 |
+
logging.getLogger('urllib3').setLevel(logging.WARNING)
|
25 |
+
|
26 |
+
logger = logging.getLogger(__name__)
|
27 |
+
|
28 |
+
def test_data_loading():
|
29 |
+
"""Test data loading functionality"""
|
30 |
+
print("="*50)
|
31 |
+
print("TESTING DATA LOADING")
|
32 |
+
print("="*50)
|
33 |
+
|
34 |
+
try:
|
35 |
+
# Initialize processor with explicit base directory
|
36 |
+
base_dir = Path(__file__).parent.parent.resolve()
|
37 |
+
processor = DataProcessor(base_dir=str(base_dir))
|
38 |
+
|
39 |
+
# Test data loading
|
40 |
+
emergency_data, treatment_data = processor.load_filtered_data()
|
41 |
+
|
42 |
+
print(f"✅ Emergency data loaded: {len(emergency_data)} records")
|
43 |
+
print(f"✅ Treatment data loaded: {len(treatment_data)} records")
|
44 |
+
|
45 |
+
# Check data structure
|
46 |
+
print("\nEmergency data columns:", list(emergency_data.columns))
|
47 |
+
print("Treatment data columns:", list(treatment_data.columns))
|
48 |
+
|
49 |
+
# Show sample data
|
50 |
+
if len(emergency_data) > 0:
|
51 |
+
print(f"\nSample emergency matched keywords: {emergency_data['matched'].iloc[0]}")
|
52 |
+
|
53 |
+
if len(treatment_data) > 0:
|
54 |
+
print(f"Sample treatment matched keywords: {treatment_data['treatment_matched'].iloc[0]}")
|
55 |
+
|
56 |
+
return True
|
57 |
+
|
58 |
+
except Exception as e:
|
59 |
+
print(f"❌ Data loading failed: {e}")
|
60 |
+
return False
|
61 |
+
|
62 |
+
def test_chunking():
|
63 |
+
"""Test chunking functionality"""
|
64 |
+
print("\n" + "="*50)
|
65 |
+
print("TESTING CHUNKING FUNCTIONALITY")
|
66 |
+
print("="*50)
|
67 |
+
|
68 |
+
try:
|
69 |
+
# Initialize processor
|
70 |
+
processor = DataProcessor()
|
71 |
+
|
72 |
+
# Load data
|
73 |
+
processor.load_filtered_data()
|
74 |
+
|
75 |
+
# Test emergency chunking (just first few records)
|
76 |
+
print("Testing emergency chunking...")
|
77 |
+
emergency_chunks = []
|
78 |
+
for idx, row in processor.emergency_data.head(3).iterrows():
|
79 |
+
if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
|
80 |
+
chunks = processor.create_keyword_centered_chunks(
|
81 |
+
text=row['clean_text'],
|
82 |
+
matched_keywords=row['matched'],
|
83 |
+
chunk_size=512,
|
84 |
+
doc_id=str(row.get('id', idx))
|
85 |
+
)
|
86 |
+
emergency_chunks.extend(chunks)
|
87 |
+
|
88 |
+
print(f"✅ Generated {len(emergency_chunks)} emergency chunks from 3 records")
|
89 |
+
|
90 |
+
# Test treatment chunking (just first few records)
|
91 |
+
print("Testing treatment chunking...")
|
92 |
+
treatment_chunks = []
|
93 |
+
for idx, row in processor.treatment_data.head(3).iterrows():
|
94 |
+
if (pd.notna(row.get('clean_text')) and
|
95 |
+
pd.notna(row.get('treatment_matched'))):
|
96 |
+
chunks = processor.create_dual_keyword_chunks(
|
97 |
+
text=row['clean_text'],
|
98 |
+
emergency_keywords=row.get('matched', ''),
|
99 |
+
treatment_keywords=row['treatment_matched'],
|
100 |
+
chunk_size=512,
|
101 |
+
doc_id=str(row.get('id', idx))
|
102 |
+
)
|
103 |
+
treatment_chunks.extend(chunks)
|
104 |
+
|
105 |
+
print(f"✅ Generated {len(treatment_chunks)} treatment chunks from 3 records")
|
106 |
+
|
107 |
+
# Show sample chunk
|
108 |
+
if emergency_chunks:
|
109 |
+
sample_chunk = emergency_chunks[0]
|
110 |
+
print(f"\nSample emergency chunk:")
|
111 |
+
print(f" Primary keyword: {sample_chunk['primary_keyword']}")
|
112 |
+
print(f" Text length: {len(sample_chunk['text'])}")
|
113 |
+
print(f" Text preview: {sample_chunk['text'][:100]}...")
|
114 |
+
|
115 |
+
if treatment_chunks:
|
116 |
+
sample_chunk = treatment_chunks[0]
|
117 |
+
print(f"\nSample treatment chunk:")
|
118 |
+
print(f" Primary keyword: {sample_chunk['primary_keyword']}")
|
119 |
+
print(f" Emergency keywords: {sample_chunk['emergency_keywords']}")
|
120 |
+
print(f" Text length: {len(sample_chunk['text'])}")
|
121 |
+
print(f" Text preview: {sample_chunk['text'][:100]}...")
|
122 |
+
|
123 |
+
return True
|
124 |
+
|
125 |
+
except Exception as e:
|
126 |
+
print(f"❌ Chunking test failed: {e}")
|
127 |
+
import traceback
|
128 |
+
traceback.print_exc()
|
129 |
+
return False
|
130 |
+
|
131 |
+
def test_model_loading():
|
132 |
+
"""Test if we can load the embedding model"""
|
133 |
+
print("\n" + "="*50)
|
134 |
+
print("TESTING MODEL LOADING")
|
135 |
+
print("="*50)
|
136 |
+
|
137 |
+
try:
|
138 |
+
processor = DataProcessor()
|
139 |
+
|
140 |
+
print("Loading NeuML/pubmedbert-base-embeddings...")
|
141 |
+
model = processor.load_embedding_model()
|
142 |
+
|
143 |
+
print(f"✅ Model loaded successfully: {processor.embedding_model_name}")
|
144 |
+
print(f"✅ Model max sequence length: {model.max_seq_length}")
|
145 |
+
|
146 |
+
# Test a simple encoding
|
147 |
+
test_text = "Patient presents with chest pain and shortness of breath."
|
148 |
+
embedding = model.encode([test_text])
|
149 |
+
|
150 |
+
print(f"✅ Test embedding shape: {embedding.shape}")
|
151 |
+
print(f"✅ Expected dimension: {processor.embedding_dim}")
|
152 |
+
|
153 |
+
assert embedding.shape[1] == processor.embedding_dim, f"Dimension mismatch: {embedding.shape[1]} != {processor.embedding_dim}"
|
154 |
+
|
155 |
+
return True
|
156 |
+
|
157 |
+
except Exception as e:
|
158 |
+
print(f"❌ Model loading failed: {e}")
|
159 |
+
import traceback
|
160 |
+
traceback.print_exc()
|
161 |
+
return False
|
162 |
+
|
163 |
+
def test_token_chunking():
|
164 |
+
"""Test token-based chunking functionality"""
|
165 |
+
try:
|
166 |
+
processor = DataProcessor()
|
167 |
+
|
168 |
+
test_text = "Patient presents with acute chest pain radiating to left arm. Initial ECG shows ST elevation."
|
169 |
+
test_keywords = "chest pain|ST elevation"
|
170 |
+
|
171 |
+
chunks = processor.create_keyword_centered_chunks(
|
172 |
+
text=test_text,
|
173 |
+
matched_keywords=test_keywords
|
174 |
+
)
|
175 |
+
|
176 |
+
print(f"\nToken chunking test:")
|
177 |
+
print(f"✓ Generated {len(chunks)} chunks")
|
178 |
+
for i, chunk in enumerate(chunks, 1):
|
179 |
+
print(f"\nChunk {i}:")
|
180 |
+
print(f" Primary keyword: {chunk['primary_keyword']}")
|
181 |
+
print(f" Content: {chunk['text']}")
|
182 |
+
|
183 |
+
return True
|
184 |
+
|
185 |
+
except Exception as e:
|
186 |
+
print(f"❌ Token chunking test failed: {e}")
|
187 |
+
return False
|
188 |
+
|
189 |
+
def main():
|
190 |
+
"""Run all tests"""
|
191 |
+
print("Starting data processing tests...\n")
|
192 |
+
|
193 |
+
# Import pandas here since it's used in chunking test
|
194 |
+
import pandas as pd
|
195 |
+
|
196 |
+
tests = [
|
197 |
+
test_data_loading,
|
198 |
+
test_chunking,
|
199 |
+
test_model_loading,
|
200 |
+
test_token_chunking # Added new test
|
201 |
+
]
|
202 |
+
|
203 |
+
results = []
|
204 |
+
for test in tests:
|
205 |
+
result = test()
|
206 |
+
results.append(result)
|
207 |
+
|
208 |
+
print("\n" + "="*50)
|
209 |
+
print("TEST SUMMARY")
|
210 |
+
print("="*50)
|
211 |
+
|
212 |
+
for i, (test, result) in enumerate(zip(tests, results), 1):
|
213 |
+
status = "✅ PASSED" if result else "❌ FAILED"
|
214 |
+
print(f"{i}. {test.__name__}: {status}")
|
215 |
+
|
216 |
+
all_passed = all(results)
|
217 |
+
|
218 |
+
if all_passed:
|
219 |
+
print("\n🎉 All tests passed! Ready to proceed with full pipeline.")
|
220 |
+
print("\nTo run the full data processing pipeline:")
|
221 |
+
print("cd FinalProject && python src/data_processing.py")
|
222 |
+
else:
|
223 |
+
print("\n⚠️ Some tests failed. Please check the issues above.")
|
224 |
+
|
225 |
+
return all_passed
|
226 |
+
|
227 |
+
if __name__ == "__main__":
|
228 |
+
main()
|
tests/test_embedding_and_index.py
ADDED
@@ -0,0 +1,29 @@
1 |
+
import numpy as np
|
2 |
+
from annoy import AnnoyIndex
|
3 |
+
import pytest
|
4 |
+
# Make src/ importable when this test is run from the tests/ directory
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))

from data_processing import DataProcessor
|
5 |
+
|
6 |
+
@pytest.fixture(scope="module")
|
7 |
+
def processor():
|
8 |
+
# Use the project root as base_dir so models/ paths resolve regardless of the working directory
return DataProcessor(base_dir=str(Path(__file__).parent.parent.resolve()))
|
9 |
+
|
10 |
+
def test_embedding_dimensions(processor):
|
11 |
+
# load emergency embeddings
|
12 |
+
emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
|
13 |
+
expected_dim = processor.embedding_dim
|
14 |
+
assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
|
15 |
+
assert emb.shape[1] == expected_dim, (
|
16 |
+
f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
|
17 |
+
)
|
18 |
+
|
19 |
+
def test_annoy_search(processor):
|
20 |
+
# load embeddings
|
21 |
+
emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
|
22 |
+
# load Annoy index
|
23 |
+
idx = AnnoyIndex(processor.embedding_dim, 'angular')
|
24 |
+
idx.load(str(processor.models_dir / "indices" / "annoy" / "emergency_index.ann"))
|
25 |
+
# perform a sample query
|
26 |
+
query_vec = emb[0]
|
27 |
+
ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
|
28 |
+
assert len(ids) == 5
|
29 |
+
assert all(0 <= d <= 2 for d in distances)
|
tests/test_embedding_validation.py
ADDED
@@ -0,0 +1,213 @@
1 |
+
"""
|
2 |
+
Test suite for validating embeddings and ANNOY functionality.
|
3 |
+
This module ensures the quality of embeddings and the correctness of ANNOY search.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import json
|
8 |
+
import logging
|
9 |
+
import os
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Tuple, List, Optional
|
12 |
+
from annoy import AnnoyIndex
|
13 |
+
from sentence_transformers import SentenceTransformer
|
14 |
+
|
15 |
+
class TestEmbeddingValidation:
|
16 |
+
def setup_class(self):
|
17 |
+
"""Initialize test environment with necessary data and models."""
|
18 |
+
# Setup logging
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.DEBUG,
|
21 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
22 |
+
filename='embedding_validation.log'
|
23 |
+
)
|
24 |
+
self.logger = logging.getLogger(__name__)
|
25 |
+
|
26 |
+
# Define base paths
|
27 |
+
self.project_root = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
28 |
+
self.models_dir = self.project_root / "models"
|
29 |
+
self.embeddings_dir = self.models_dir / "embeddings"
|
30 |
+
self.indices_dir = self.models_dir / "indices" / "annoy"
|
31 |
+
|
32 |
+
self.logger.info(f"Project root: {self.project_root}")
|
33 |
+
self.logger.info(f"Models directory: {self.models_dir}")
|
34 |
+
self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
|
35 |
+
|
36 |
+
try:
|
37 |
+
# Check directory existence
|
38 |
+
if not self.embeddings_dir.exists():
|
39 |
+
raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
|
40 |
+
if not self.indices_dir.exists():
|
41 |
+
raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
|
42 |
+
|
43 |
+
# Load embeddings
|
44 |
+
self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
|
45 |
+
self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
|
46 |
+
|
47 |
+
# Load chunks
|
48 |
+
with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
|
49 |
+
self.emergency_chunks = json.load(f)
|
50 |
+
with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
|
51 |
+
self.treatment_chunks = json.load(f)
|
52 |
+
|
53 |
+
# Initialize model
|
54 |
+
self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
|
55 |
+
|
56 |
+
self.logger.info("Test environment initialized successfully")
|
57 |
+
self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
|
58 |
+
self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
|
59 |
+
|
60 |
+
except FileNotFoundError as e:
|
61 |
+
self.logger.error(f"File not found: {e}")
|
62 |
+
raise
|
63 |
+
except Exception as e:
|
64 |
+
self.logger.error(f"Error during initialization: {e}")
|
65 |
+
raise
|
66 |
+
|
67 |
+
def _safe_search(
|
68 |
+
self,
|
69 |
+
index: AnnoyIndex,
|
70 |
+
query_vector: np.ndarray,
|
71 |
+
k: int = 5
|
72 |
+
) -> Tuple[Optional[List[int]], Optional[List[float]]]:
|
73 |
+
"""Safe search wrapper with error handling"""
|
74 |
+
try:
|
75 |
+
indices, distances = index.get_nns_by_vector(
|
76 |
+
query_vector, k, include_distances=True
|
77 |
+
)
|
78 |
+
self.logger.debug(f"Search successful: found {len(indices)} results")
|
79 |
+
return indices, distances
|
80 |
+
|
81 |
+
except Exception as e:
|
82 |
+
self.logger.error(f"Search failed: {str(e)}")
|
83 |
+
return None, None
|
84 |
+
|
85 |
+
def test_embedding_dimensions(self):
|
86 |
+
"""Test embedding dimensions and data quality."""
|
87 |
+
self.logger.info("\n=== Embedding Validation Report ===")
|
88 |
+
|
89 |
+
try:
|
90 |
+
# Basic dimension checks
|
91 |
+
assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
|
92 |
+
assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
|
93 |
+
|
94 |
+
# Count verification
|
95 |
+
assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
|
96 |
+
"Emergency chunks count mismatch"
|
97 |
+
assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
|
98 |
+
"Treatment chunks count mismatch"
|
99 |
+
|
100 |
+
# Data quality checks
|
101 |
+
for name, emb in [("Emergency", self.emergency_emb),
|
102 |
+
("Treatment", self.treatment_emb)]:
|
103 |
+
# Check for NaN and Inf
|
104 |
+
assert not np.isnan(emb).any(), f"{name} contains NaN values"
|
105 |
+
assert not np.isinf(emb).any(), f"{name} contains Inf values"
|
106 |
+
|
107 |
+
# Value distribution analysis
|
108 |
+
self.logger.info(f"\n{name} Embeddings Statistics:")
|
109 |
+
self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
|
110 |
+
self.logger.info(f"- Mean: {np.mean(emb):.3f}")
|
111 |
+
self.logger.info(f"- Std: {np.std(emb):.3f}")
|
112 |
+
|
113 |
+
self.logger.info("\n✅ All embedding validations passed")
|
114 |
+
|
115 |
+
except AssertionError as e:
|
116 |
+
self.logger.error(f"Validation failed: {str(e)}")
|
117 |
+
raise
|
118 |
+
|
119 |
+
def test_multiple_known_item_search(self):
|
120 |
+
"""Test ANNOY search with multiple random samples."""
|
121 |
+
self.logger.info("\n=== Multiple Known-Item Search Test ===")
|
122 |
+
|
123 |
+
emergency_index = AnnoyIndex(768, 'angular')
|
124 |
+
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
125 |
+
|
126 |
+
# Test 20 random samples
|
127 |
+
test_indices = np.random.choice(
|
128 |
+
self.emergency_emb.shape[0],
|
129 |
+
size=20,
|
130 |
+
replace=False
|
131 |
+
)
|
132 |
+
|
133 |
+
success_count = 0
|
134 |
+
for test_idx in test_indices:
|
135 |
+
try:
|
136 |
+
test_emb = self.emergency_emb[test_idx]
|
137 |
+
indices, distances = self._safe_search(emergency_index, test_emb)
|
138 |
+
|
139 |
+
if indices is None:
|
140 |
+
continue
|
141 |
+
|
142 |
+
# Verify self-retrieval
|
143 |
+
assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
|
144 |
+
assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
|
145 |
+
success_count += 1
|
146 |
+
|
147 |
+
except AssertionError as e:
|
148 |
+
self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
|
149 |
+
|
150 |
+
self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
|
151 |
+
assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
|
152 |
+
|
153 |
+
def test_balanced_cross_dataset_search(self):
|
154 |
+
"""Test search across both emergency and treatment datasets."""
|
155 |
+
self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
|
156 |
+
|
157 |
+
# Initialize indices
|
158 |
+
emergency_index = AnnoyIndex(768, 'angular')
|
159 |
+
treatment_index = AnnoyIndex(768, 'angular')
|
160 |
+
|
161 |
+
try:
|
162 |
+
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
163 |
+
treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
|
164 |
+
|
165 |
+
# Test queries
|
166 |
+
test_queries = [
|
167 |
+
"What is the treatment protocol for acute myocardial infarction?",
|
168 |
+
"How to manage severe chest pain with difficulty breathing?",
|
169 |
+
"What are the emergency procedures for anaphylactic shock?"
|
170 |
+
]
|
171 |
+
|
172 |
+
for query in test_queries:
|
173 |
+
print(f"\n\n=== Query: {query} ===")
|
174 |
+
|
175 |
+
# Generate query vector
|
176 |
+
query_emb = self.model.encode([query])[0]
|
177 |
+
|
178 |
+
# Get top-5 results from each dataset
|
179 |
+
e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
|
180 |
+
t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
|
181 |
+
|
182 |
+
if None in [e_indices, e_distances, t_indices, t_distances]:
|
183 |
+
self.logger.error("Search failed for one or both datasets")
|
184 |
+
continue
|
185 |
+
|
186 |
+
# Print first sentence of each result
|
187 |
+
print("\nEmergency Dataset Results:")
|
188 |
+
for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
|
189 |
+
text = self.emergency_chunks[idx]['text']
|
190 |
+
first_sentence = text.split('.')[0] + '.'
|
191 |
+
print(f"\nE-{i} (distance: {dist:.3f}):")
|
192 |
+
print(first_sentence)
|
193 |
+
|
194 |
+
print("\nTreatment Dataset Results:")
|
195 |
+
for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
|
196 |
+
text = self.treatment_chunks[idx]['text']
|
197 |
+
first_sentence = text.split('.')[0] + '.'
|
198 |
+
print(f"\nT-{i} (distance: {dist:.3f}):")
|
199 |
+
print(first_sentence)
|
200 |
+
|
201 |
+
except Exception as e:
|
202 |
+
self.logger.error(f"Test failed: {str(e)}")
|
203 |
+
raise
|
204 |
+
else:
|
205 |
+
self.logger.info("\n✅ Cross-dataset search test completed")
|
206 |
+
|
207 |
+
if __name__ == "__main__":
|
208 |
+
# Manual test execution
|
209 |
+
test = TestEmbeddingValidation()
|
210 |
+
test.setup_class()
|
211 |
+
test.test_embedding_dimensions()
|
212 |
+
test.test_multiple_known_item_search()
|
213 |
+
test.test_balanced_cross_dataset_search()
|