Yan-Bo Chen committed on
Commit b85d6ac · 2 Parent(s): cd2cfdd 985c260

Merge pull request #2 from YanBoChen0928/embedding


### 🔧 Git History Cleanup: Removed Large Files + Forced Push

.gitignore CHANGED
@@ -1,10 +1,34 @@
1
- dataset/dataset/
2
-
3
- #virtual environment
4
  genAIvenv/
5
  .final_project_env/
6
7
 
8
- .DS_Store
9
 
10
- docs/
1
+ # 🧠 Virtual environments
 
 
2
  genAIvenv/
3
  .final_project_env/
4
 
5
+ # 💻 OS / Editor garbage
6
+ .DS_Store
7
+ .vscode/
8
+
9
+ # 📁 Documentation and project folders
10
+ docs/
11
+ dataset/dataset/
12
+
13
+ # 🧾 Compiled / output files
14
+ *.pyc
15
+ *.log
16
+ *.zip
17
+ *.tar.gz
18
+ *.mp4
19
+ *.mov
20
+ *.json
21
+ *.png
22
 
23
+ # 🚫 Large files - models
24
+ models/cache/
25
+ models/cache/*.pkl
26
+ models/embeddings/*.npy
27
+ models/embeddings/*.json
28
+ models/indices/
29
+ models/indices/annoy/*.ann
30
 
31
+ # 🚫 Redundant catch-all for large file extensions
32
+ *.pkl
33
+ *.npy
34
+ *.ann
dataset/analysis/integrity_check/integrity_check_report.json DELETED
@@ -1,29 +0,0 @@
1
- {
2
- "sample_analysis": {
3
- "matched": {
4
- "non_null": 100,
5
- "non_empty": 100,
6
- "unique_values": 84
7
- },
8
- "treatment_matched": {
9
- "non_null": 100,
10
- "non_empty": 100,
11
- "unique_values": 100
12
- }
13
- },
14
- "full_file_analysis": {
15
- "total_records": 9367,
16
- "matched_column": {
17
- "non_null_count": 9367,
18
- "non_empty_count": 9367,
19
- "null_percentage": 0.0
20
- },
21
- "treatment_matched_column": {
22
- "non_null_count": 9367,
23
- "non_empty_count": 9367,
24
- "null_percentage": 0.0
25
- },
26
- "both_matched_count": 3315,
27
- "both_matched_percentage": 35.39019963702359
28
- }
29
- }
dataset/analysis/keyword_matching_test_results.json DELETED
@@ -1,151 +0,0 @@
1
- {
2
- "special_terms_matching": [
3
- {
4
- "clean_text": "Patient needs an x-ray of the chest",
5
- "category": "x-ray variants",
6
- "matched": "x-ray"
7
- },
8
- {
9
- "clean_text": "Ordered chest xray",
10
- "category": "x-ray variants",
11
- "matched": "xray"
12
- },
13
- {
14
- "clean_text": "X ray shows pneumonia",
15
- "category": "x-ray variants",
16
- "matched": "X ray"
17
- },
18
- {
19
- "clean_text": "XRAY negative",
20
- "category": "x-ray variants",
21
- "matched": "XRAY"
22
- },
23
- {
24
- "clean_text": "CT scan reveals nodule",
25
- "category": "ct-scan variants",
26
- "matched": "CT scan"
27
- },
28
- {
29
- "clean_text": "CT-scan indicates mass",
30
- "category": "ct-scan variants",
31
- "matched": "CT-scan"
32
- },
33
- {
34
- "clean_text": "Requires ctscan urgently",
35
- "category": "ct-scan variants",
36
- "matched": "ctscan"
37
- },
38
- {
39
- "clean_text": "CTSCAN of abdomen",
40
- "category": "ct-scan variants",
41
- "matched": "CTSCAN"
42
- },
43
- {
44
- "clean_text": "Point-of-care testing needed",
45
- "category": "point-of-care variants",
46
- "matched": "Point-of-care"
47
- },
48
- {
49
- "clean_text": "Point of care ultrasound",
50
- "category": "point-of-care variants",
51
- "matched": "Point of care"
52
- },
53
- {
54
- "clean_text": "POC testing results",
55
- "category": "point-of-care variants",
56
- "matched": ""
57
- },
58
- {
59
- "clean_text": "Ordered both x-ray and CT scan",
60
- "category": "mixed cases",
61
- "matched": "x-ray|CT scan"
62
- },
63
- {
64
- "clean_text": "XRAY and CTSCAN negative",
65
- "category": "mixed cases",
66
- "matched": "XRAY|CTSCAN"
67
- },
68
- {
69
- "clean_text": "Multiple point-of-care tests with x-ray",
70
- "category": "mixed cases",
71
- "matched": "point-of-care|x-ray"
72
- },
73
- {
74
- "clean_text": "No imaging mentioned",
75
- "category": "negative cases",
76
- "matched": ""
77
- },
78
- {
79
- "clean_text": "Regular examination only",
80
- "category": "negative cases",
81
- "matched": ""
82
- },
83
- {
84
- "clean_text": "Laboratory tests pending",
85
- "category": "negative cases",
86
- "matched": ""
87
- }
88
- ],
89
- "basic_matching": [
90
- {
91
- "clean_text": "Emergency treatment required",
92
- "category": "simple matches",
93
- "matched": "Emergency"
94
- },
95
- {
96
- "clean_text": "Acute condition observed",
97
- "category": "simple matches",
98
- "matched": "Acute"
99
- },
100
- {
101
- "clean_text": "Urgent care needed",
102
- "category": "simple matches",
103
- "matched": "Urgent"
104
- },
105
- {
106
- "clean_text": "EMERGENCY situation",
107
- "category": "case variations",
108
- "matched": "EMERGENCY"
109
- },
110
- {
111
- "clean_text": "Acute RESPIRATORY failure",
112
- "category": "case variations",
113
- "matched": "Acute"
114
- },
115
- {
116
- "clean_text": "URgent surgical intervention",
117
- "category": "case variations",
118
- "matched": "URgent"
119
- },
120
- {
121
- "clean_text": "Emergency treatment for acute condition",
122
- "category": "multiple matches",
123
- "matched": "Emergency|acute"
124
- },
125
- {
126
- "clean_text": "Urgent care in emergency department",
127
- "category": "multiple matches",
128
- "matched": "Urgent|emergency"
129
- },
130
- {
131
- "clean_text": "Acute respiratory emergency",
132
- "category": "multiple matches",
133
- "matched": "Acute|emergency"
134
- },
135
- {
136
- "clean_text": "Non-emergency situation",
137
- "category": "partial words",
138
- "matched": "emergency"
139
- },
140
- {
141
- "clean_text": "Subacute condition",
142
- "category": "partial words",
143
- "matched": ""
144
- },
145
- {
146
- "clean_text": "Emergency-related",
147
- "category": "partial words",
148
- "matched": "Emergency"
149
- }
150
- ]
151
- }
dataset/analysis/stats/analysis_stats_emergency_subset.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "basic_statistics": {
3
- "total_records": 10282,
4
- "avg_length": 25185.078194903715
5
- },
6
- "keyword_statistics": {
7
- "Acute abdomen": 52,
8
- "Acute bleeding": 31,
9
- "Acute Coronary Syndrome": 345,
10
- "Acute Kidney Injury": 202,
11
- "Acute pancreatitis": 214,
12
- "Acute respiratory distress syndrome": 231,
13
- "Acute stroke": 67,
14
- "Anaphylaxis": 1016,
15
- "Anaphylactic Shock": 153,
16
- "Arrhythmia": 1547,
17
- "Atrial fibrillation": 771,
18
- "Atrial flutter": 146,
19
- "Bradycardia": 884,
20
- "Cardiac arrest": 614,
21
- "Cardiogenic Shock": 196,
22
- "Chest pain": 1433,
23
- "Dyspnea": 1319,
24
- "Fever": 4270,
25
- "Gastrointestinal Hemorrhage": 158,
26
- "GI bleeding": 105,
27
- "Hemorrhage": 1611,
28
- "Hemorrhagic stroke": 117,
29
- "Hyperthermia": 305,
30
- "Hypovolemic Shock": 63,
31
- "Hypotension": 1929,
32
- "Hypothermia": 356,
33
- "Internal bleeding": 70,
34
- "Intracranial Hemorrhages": 6,
35
- "Ischemic stroke": 224,
36
- "Loss of consciousness": 422,
37
- "Myocardial Infarction": 1708,
38
- "MI": 10183,
39
- "Pulmonary Edema": 487,
40
- "Pulmonary Embolism": 654,
41
- "Respiratory distress": 730,
42
- "Respiratory failure": 579,
43
- "Sepsis": 1181,
44
- "Severe Sepsis": 81,
45
- "Septic Shock": 244,
46
- "Shock": 1881,
47
- "Status Epilepticus": 150,
48
- "Syncope": 834,
49
- "Tachycardia": 1650,
50
- "Tachypnea": 268,
51
- "Traumatic Brain Injury": 171,
52
- "Ventricular Tachycardia": 491,
53
- "Ventricular fibrillation": 295
54
- }
55
- }
dataset/analysis/stats/analysis_stats_emergency_subset_opt.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "basic_statistics": {
3
- "total_records": 11914,
4
- "avg_length": 23847.07579318449
5
- },
6
- "keyword_statistics": {
7
- "Acute abdomen": 52,
8
- "Acute bleeding": 31,
9
- "Acute Coronary Syndrome": 351,
10
- "Acute Kidney Injury": 202,
11
- "Acute pancreatitis": 214,
12
- "Acute respiratory distress syndrome": 231,
13
- "Acute stroke": 67,
14
- "Anaphylaxis": 1016,
15
- "Anaphylactic Shock": 153,
16
- "Arrhythmia": 1564,
17
- "Atrial fibrillation": 771,
18
- "Atrial flutter": 146,
19
- "Bradycardia": 884,
20
- "Cardiac arrest": 614,
21
- "Cardiogenic Shock": 196,
22
- "Chest pain": 1434,
23
- "Dyspnea": 1319,
24
- "Fever": 4279,
25
- "Gastrointestinal Hemorrhage": 158,
26
- "GI bleeding": 105,
27
- "Hemorrhage": 1621,
28
- "Hemorrhagic stroke": 117,
29
- "Hyperthermia": 305,
30
- "Hypovolemic Shock": 63,
31
- "Hypotension": 1929,
32
- "Hypothermia": 356,
33
- "Internal bleeding": 70,
34
- "Intracranial Hemorrhages": 6,
35
- "Ischemic stroke": 225,
36
- "Loss of consciousness": 422,
37
- "Myocardial Infarction": 1710,
38
- "MI": 11773,
39
- "Pulmonary Edema": 487,
40
- "Pulmonary Embolism": 654,
41
- "Respiratory distress": 730,
42
- "Respiratory failure": 579,
43
- "Sepsis": 1188,
44
- "Severe Sepsis": 81,
45
- "Septic Shock": 244,
46
- "Shock": 1892,
47
- "Status Epilepticus": 150,
48
- "Syncope": 834,
49
- "Tachycardia": 1651,
50
- "Tachypnea": 268,
51
- "Traumatic Brain Injury": 171,
52
- "Ventricular Tachycardia": 492,
53
- "Ventricular fibrillation": 295
54
- }
55
- }
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json DELETED
@@ -1,293 +0,0 @@
1
- {
2
- "basic_statistics": {
3
- "total_records": 9367,
4
- "avg_text_length": 27179.22952919825,
5
- "emergency_keywords_count": 47,
6
- "treatment_keywords_count": 105
7
- },
8
- "emergency_keyword_stats": {
9
- "Acute abdomen": 51,
10
- "Acute bleeding": 31,
11
- "Acute Coronary Syndrome": 332,
12
- "Acute Kidney Injury": 200,
13
- "Acute pancreatitis": 202,
14
- "Acute respiratory distress syndrome": 225,
15
- "Acute stroke": 65,
16
- "Anaphylaxis": 1002,
17
- "Anaphylactic Shock": 148,
18
- "Arrhythmia": 1490,
19
- "Atrial fibrillation": 736,
20
- "Atrial flutter": 139,
21
- "Bradycardia": 845,
22
- "Cardiac arrest": 600,
23
- "Cardiogenic Shock": 192,
24
- "Chest pain": 1408,
25
- "Dyspnea": 1296,
26
- "Fever": 4008,
27
- "Gastrointestinal Hemorrhage": 158,
28
- "GI bleeding": 103,
29
- "Hemorrhage": 1532,
30
- "Hemorrhagic stroke": 109,
31
- "Hyperthermia": 283,
32
- "Hypovolemic Shock": 61,
33
- "Hypotension": 1897,
34
- "Hypothermia": 340,
35
- "Internal bleeding": 67,
36
- "Intracranial Hemorrhages": 5,
37
- "Ischemic stroke": 216,
38
- "Loss of consciousness": 406,
39
- "Myocardial Infarction": 1607,
40
- "MI": 9316,
41
- "Pulmonary Edema": 471,
42
- "Pulmonary Embolism": 624,
43
- "Respiratory distress": 713,
44
- "Respiratory failure": 554,
45
- "Sepsis": 1145,
46
- "Severe Sepsis": 81,
47
- "Septic Shock": 231,
48
- "Shock": 1702,
49
- "Status Epilepticus": 149,
50
- "Syncope": 806,
51
- "Tachycardia": 1576,
52
- "Tachypnea": 262,
53
- "Traumatic Brain Injury": 151,
54
- "Ventricular Tachycardia": 461,
55
- "Ventricular fibrillation": 280
56
- },
57
- "treatment_keyword_stats": {
58
- "ACLS": 30,
59
- "administer": 3881,
60
- "Adrenaline": 135,
61
- "Advanced Cardiac Life Support": 34,
62
- "Airway Management": 174,
63
- "alpha blocker": 35,
64
- "Amiodarone": 315,
65
- "analgesia": 323,
66
- "Anesthesia Procedural": 0,
67
- "Anti-Bacterial Agents": 1,
68
- "antibiotic": 1922,
69
- "arterial line placement": 0,
70
- "beta blocker": 297,
71
- "Bi-level Positive Airway Pressure": 6,
72
- "bipap": 25,
73
- "Blood Transfusion": 379,
74
- "Bosmin": 0,
75
- "Cardiopulmonary Resuscitation": 131,
76
- "Cardioversion": 142,
77
- "Catheterization Arterial": 0,
78
- "Catheterization Central Venous": 0,
79
- "central line placement": 6,
80
- "compression dressing": 2,
81
- "Computed Tomography": 518,
82
- "cpap": 84,
83
- "cpr": 151,
84
- "crystalloids": 45,
85
- "ct scan": 1036,
86
- "Defibrillation": 96,
87
- "Dopamine": 389,
88
- "Dosage Forms": 210,
89
- "dose": 5344,
90
- "Drug Administration Routes": 0,
91
- "Drug Therapy": 773,
92
- "Epinephrine": 806,
93
- "fluid": 2938,
94
- "fluid resuscitation": 115,
95
- "hemodynamic monitoring": 43,
96
- "Hemodynamics": 135,
97
- "Hemostasis": 180,
98
- "Ibuprofen": 269,
99
- "icu transfer": 9,
100
- "Insulin": 808,
101
- "intervention": 2695,
102
- "intubation": 493,
103
- "Intratracheal Intubation": 3,
104
- "Intravenous Infusion": 576,
105
- "iv fluids": 75,
106
- "laboratory techniques": 29,
107
- "laboratory testing": 296,
108
- "levophed": 11,
109
- "Lidocaine": 212,
110
- "manage": 4416,
111
- "management": 4008,
112
- "medication": 4698,
113
- "midazolam": 204,
114
- "monitor": 4521,
115
- "monitoring": 3593,
116
- "Morphine": 289,
117
- "Nebulization": 41,
118
- "nitroglycerin": 125,
119
- "NTG": 81,
120
- "Norepinephrine": 392,
121
- "normal saline": 252,
122
- "Ondansetron": 43,
123
- "Oxygen": 1779,
124
- "Oxygen Inhalation Therapy": 2,
125
- "oxygen therapy": 178,
126
- "Patient Management": 281,
127
- "Patient Monitoring": 107,
128
- "POCUS": 10,
129
- "point of care ultrasound": 2,
130
- "procedural sedation": 26,
131
- "procedure": 3073,
132
- "radiologic imaging": 5,
133
- "Radiography": 218,
134
- "resuscitation": 539,
135
- "Sedation": 602,
136
- "splinting": 26,
137
- "Splints": 29,
138
- "supportive care": 564,
139
- "surgical procedures": 482,
140
- "Surgical Procedures Operative": 0,
141
- "surgery": 3531,
142
- "Suture": 179,
143
- "Suturing": 53,
144
- "Therapeutic Intervention": 181,
145
- "Therapeutics": 182,
146
- "Therapy": 6117,
147
- "tourniquet": 56,
148
- "transfusion": 826,
149
- "treat": 8270,
150
- "treatment": 7719,
151
- "Ultrasonography Point of Care": 0,
152
- "ultrasound": 1273,
153
- "Vasoconstrictor Agents": 2,
154
- "vasopressors": 188,
155
- "ventilation support": 14,
156
- "Ventilators": 86,
157
- "Vital Signs": 459,
158
- "vital signs monitoring": 1,
159
- "wound care": 73,
160
- "Wound Dressing": 30,
161
- "Wound Management": 37,
162
- "X-Ray": 1293
163
- },
164
- "cooccurrence_analysis": [
165
- {
166
- "emergency_keyword": "Fever",
167
- "treatment_keyword": "treatment",
168
- "cooccurrence_count": 3488,
169
- "percentage": 37.23710899967973
170
- },
171
- {
172
- "emergency_keyword": "Fever",
173
- "treatment_keyword": "Therapy",
174
- "cooccurrence_count": 2698,
175
- "percentage": 28.803245436105477
176
- },
177
- {
178
- "emergency_keyword": "Fever",
179
- "treatment_keyword": "dose",
180
- "cooccurrence_count": 2430,
181
- "percentage": 25.94213729048788
182
- },
183
- {
184
- "emergency_keyword": "Fever",
185
- "treatment_keyword": "medication",
186
- "cooccurrence_count": 1979,
187
- "percentage": 21.127362015586634
188
- },
189
- {
190
- "emergency_keyword": "Hypotension",
191
- "treatment_keyword": "treatment",
192
- "cooccurrence_count": 1760,
193
- "percentage": 18.789366926443897
194
- },
195
- {
196
- "emergency_keyword": "Fever",
197
- "treatment_keyword": "management",
198
- "cooccurrence_count": 1753,
199
- "percentage": 18.714636489804633
200
- },
201
- {
202
- "emergency_keyword": "Fever",
203
- "treatment_keyword": "treat",
204
- "cooccurrence_count": 1744,
205
- "percentage": 18.618554499839863
206
- },
207
- {
208
- "emergency_keyword": "Fever",
209
- "treatment_keyword": "monitoring",
210
- "cooccurrence_count": 1674,
211
- "percentage": 17.87125013344721
212
- },
213
- {
214
- "emergency_keyword": "Hypotension",
215
- "treatment_keyword": "Therapy",
216
- "cooccurrence_count": 1558,
217
- "percentage": 16.63286004056795
218
- },
219
- {
220
- "emergency_keyword": "Fever",
221
- "treatment_keyword": "surgery",
222
- "cooccurrence_count": 1505,
223
- "percentage": 16.06704387744208
224
- },
225
- {
226
- "emergency_keyword": "Tachycardia",
227
- "treatment_keyword": "treatment",
228
- "cooccurrence_count": 1441,
229
- "percentage": 15.383794171025942
230
- },
231
- {
232
- "emergency_keyword": "Hypotension",
233
- "treatment_keyword": "dose",
234
- "cooccurrence_count": 1423,
235
- "percentage": 15.191630191096403
236
- },
237
- {
238
- "emergency_keyword": "Myocardial Infarction",
239
- "treatment_keyword": "treatment",
240
- "cooccurrence_count": 1369,
241
- "percentage": 14.615138251307783
242
- },
243
- {
244
- "emergency_keyword": "Shock",
245
- "treatment_keyword": "treatment",
246
- "cooccurrence_count": 1340,
247
- "percentage": 14.305540728087967
248
- },
249
- {
250
- "emergency_keyword": "Fever",
251
- "treatment_keyword": "fluid",
252
- "cooccurrence_count": 1330,
253
- "percentage": 14.198782961460447
254
- },
255
- {
256
- "emergency_keyword": "Hemorrhage",
257
- "treatment_keyword": "treatment",
258
- "cooccurrence_count": 1328,
259
- "percentage": 14.177431408134941
260
- },
261
- {
262
- "emergency_keyword": "Hypotension",
263
- "treatment_keyword": "monitoring",
264
- "cooccurrence_count": 1325,
265
- "percentage": 14.145404078146683
266
- },
267
- {
268
- "emergency_keyword": "Tachycardia",
269
- "treatment_keyword": "Therapy",
270
- "cooccurrence_count": 1277,
271
- "percentage": 13.632966798334579
272
- },
273
- {
274
- "emergency_keyword": "Dyspnea",
275
- "treatment_keyword": "treatment",
276
- "cooccurrence_count": 1228,
277
- "percentage": 13.10985374185972
278
- },
279
- {
280
- "emergency_keyword": "Myocardial Infarction",
281
- "treatment_keyword": "Therapy",
282
- "cooccurrence_count": 1215,
283
- "percentage": 12.97106864524394
284
- }
285
- ],
286
- "path_b_validation": {
287
- "avg_emergency_density": 0.3098621434407273,
288
- "avg_treatment_density": 0.6108515041451529,
289
- "high_density_records": 1298,
290
- "precision_estimate": 0.9995729689334899
291
- },
292
- "condition_mapping_candidates": {}
293
- }
dataset/scripts/data_explorer_treatment.py CHANGED
@@ -2,7 +2,7 @@
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
- import numpy as np
6
  from pathlib import Path
7
  import json
8
  from tqdm import tqdm
 
2
  import pandas as pd
3
  import matplotlib.pyplot as plt
4
  import seaborn as sns
5
+ # Removed duplicate import of numpy
6
  from pathlib import Path
7
  import json
8
  from tqdm import tqdm
requirements.txt CHANGED
@@ -64,6 +64,7 @@ safehttpx==0.1.6
64
  safetensors==0.5.3
65
  seaborn==0.13.2
66
  semantic-version==2.10.0
 
67
  shellingham==1.5.4
68
  six==1.17.0
69
  sniffio==1.3.1
 
64
  safetensors==0.5.3
65
  seaborn==0.13.2
66
  semantic-version==2.10.0
67
+ sentence-transformers==3.0.1
68
  shellingham==1.5.4
69
  six==1.17.0
70
  sniffio==1.3.1
src/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """
2
+ OnCall.ai src package
3
+
4
+ This package contains the core implementation of the OnCall.ai system.
5
+ """
6
+
7
+ # Version
8
+ __version__ = '0.1.0'
src/data_processing.py ADDED
@@ -0,0 +1,698 @@
1
+ """
2
+ OnCall.ai Data Processing Module
3
+
4
+ This module handles:
5
+ 1. Loading filtered medical guideline data
6
+ 2. Creating intelligent chunks based on matched keywords
7
+ 3. Generating embeddings using NeuML/pubmedbert-base-embeddings
8
+ 4. Building ANNOY indices for vector search
9
+ 5. Data quality validation
10
+
11
+ Author: OnCall.ai Team
12
+ Date: 2025-07-26
13
+ """
14
+
15
+ # Required imports for core functionality
16
+ import json
17
+ import pandas as pd
18
+ import numpy as np
19
+ from pathlib import Path
20
+ from typing import List, Dict, Tuple, Any
21
+ from sentence_transformers import SentenceTransformer
22
+ from annoy import AnnoyIndex
23
+ import logging
24
+ from tqdm import tqdm
25
+
26
+ # Setup logging
27
+ logging.basicConfig(
28
+ level=logging.INFO,  # switch between INFO and DEBUG as needed
29
+ format='%(levelname)s:%(name)s:%(message)s'
30
+ )
31
+ logger = logging.getLogger(__name__)
32
+
33
+ # Explicitly define what should be exported
34
+ __all__ = ['DataProcessor']
35
+
36
+ class DataProcessor:
37
+ """Main data processing class for OnCall.ai RAG system"""
38
+
39
+ def __init__(self, base_dir: str = None):
40
+ """
41
+ Initialize DataProcessor
42
+
43
+ Args:
44
+ base_dir: Base directory path for the project
45
+ """
46
+ self.base_dir = Path(base_dir).resolve() if base_dir else Path(__file__).parent.parent.resolve()
47
+ self.dataset_dir = (self.base_dir / "dataset" / "dataset").resolve() # modify to actual dataset directory
48
+ self.models_dir = (self.base_dir / "models").resolve()
49
+
50
+ # Model configuration
51
+ self.embedding_model_name = "NeuML/pubmedbert-base-embeddings"
52
+ self.embedding_dim = 768 # PubMedBERT dimension
53
+ self.chunk_size = 256 # Changed to tokens instead of characters
54
+ self.chunk_overlap = 64 # Added overlap configuration
55
+
56
+ # Initialize model and tokenizer (will be loaded when needed)
57
+ self.embedding_model = None
58
+ self.tokenizer = None
59
+
60
+ # Data containers
61
+ self.emergency_data = None
62
+ self.treatment_data = None
63
+ self.emergency_chunks = []
64
+ self.treatment_chunks = []
65
+
66
+ # Initialize indices
67
+ self.emergency_index = None
68
+ self.treatment_index = None
69
+
70
+ logger.info(f"Initialized DataProcessor with:")
71
+ logger.info(f" Base directory: {self.base_dir}")
72
+ logger.info(f" Dataset directory: {self.dataset_dir}")
73
+ logger.info(f" Models directory: {self.models_dir}")
74
+ logger.info(f" Chunk size (tokens): {self.chunk_size}")
75
+ logger.info(f" Chunk overlap (tokens): {self.chunk_overlap}")
76
+
77
+ def load_embedding_model(self):
78
+ """Load the embedding model and initialize tokenizer"""
79
+ if self.embedding_model is None:
80
+ logger.info(f"Loading embedding model: {self.embedding_model_name}")
81
+ self.embedding_model = SentenceTransformer(self.embedding_model_name)
82
+ self.tokenizer = self.embedding_model.tokenizer
83
+ logger.info("Embedding model and tokenizer loaded successfully")
84
+ return self.embedding_model
85
+
86
+ def load_filtered_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
87
+ """
88
+ Load pre-filtered emergency and treatment data
89
+
90
+ Returns:
91
+ Tuple of (emergency_data, treatment_data) DataFrames
92
+ """
93
+ logger.info("Loading filtered medical data...")
94
+
95
+ # File paths
96
+ emergency_path = (self.dataset_dir / "emergency" / "emergency_subset_opt.jsonl").resolve()
97
+ treatment_path = (self.dataset_dir / "emergency_treatment" / "emergency_treatment_subset_opt.jsonl").resolve()
98
+
99
+ logger.info(f"Looking for emergency data at: {emergency_path}")
100
+ logger.info(f"Looking for treatment data at: {treatment_path}")
101
+
102
+ # Validate file existence
103
+ if not emergency_path.exists():
104
+ raise FileNotFoundError(f"Emergency data not found: {emergency_path}")
105
+ if not treatment_path.exists():
106
+ raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
107
+
108
+ # Load data
109
+ self.emergency_data = pd.read_json(str(emergency_path), lines=True)  # use str() so the path is handled correctly
110
+ self.treatment_data = pd.read_json(str(treatment_path), lines=True)
111
+
112
+ logger.info(f"Loaded {len(self.emergency_data)} emergency records")
113
+ logger.info(f"Loaded {len(self.treatment_data)} treatment records")
114
+
115
+ return self.emergency_data, self.treatment_data
116
+
117
+ def create_keyword_centered_chunks(self, text: str, matched_keywords: str,
118
+ chunk_size: int = None, doc_id: str = None) -> List[Dict[str, Any]]:
119
+ """
120
+ Create chunks centered around matched keywords using tokenizer
121
+
122
+ Args:
123
+ text: Input text
124
+ matched_keywords: Pipe-separated keywords (e.g., "MI|chest pain|fever")
125
+ chunk_size: Size of each chunk in tokens (defaults to self.chunk_size)
126
+ doc_id: Document ID for tracking
127
+
128
+ Returns:
129
+ List of chunk dictionaries
130
+ """
131
+ if not matched_keywords or pd.isna(matched_keywords):
132
+ return []
133
+
134
+ # Load model if not loaded (to get tokenizer)
135
+ if self.tokenizer is None:
136
+ self.load_embedding_model()
137
+
138
+ # Convert text and keywords to lowercase at the start
139
+ text = text.lower()
140
+ keywords = [kw.lower() for kw in matched_keywords.split("|")] if matched_keywords else []
141
+
142
+ chunk_size = chunk_size or self.chunk_size
143
+ chunks = []
144
+
145
+ # Calculate character-to-token ratio using a sample around the first keyword
146
+ if keywords:
147
+ first_keyword = keywords[0]
148
+ first_pos = text.find(first_keyword)
149
+ if first_pos != -1:
150
+ # Take a sample around the first keyword for ratio calculation
151
+ sample_start = max(0, first_pos - 100)
152
+ sample_end = min(len(text), first_pos + len(first_keyword) + 100)
153
+ sample_text = text[sample_start:sample_end]
154
+ sample_tokens = len(self.tokenizer.tokenize(sample_text))
155
+ chars_per_token = len(sample_text) / sample_tokens if sample_tokens > 0 else 4.0
156
+ else:
157
+ chars_per_token = 4.0 # Fallback ratio
158
+ else:
159
+ chars_per_token = 4.0 # Default ratio
160
+
161
+ # Process keywords
162
+ for i, keyword in enumerate(keywords):
163
+ # Find keyword position in text (already lowercase)
164
+ keyword_pos = text.find(keyword)
165
+
166
+ if keyword_pos != -1:
167
+ # Get the keyword text (already lowercase)
168
+ actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
169
+
170
+ # Calculate rough window size using dynamic ratio
171
+ # Cap the rough chunk target token size to prevent tokenizer warnings
172
+ # Use 512 tokens as target (model's max limit)
173
+ ROUGH_CHUNK_TARGET_TOKENS = 512
174
+ char_window = int(ROUGH_CHUNK_TARGET_TOKENS * chars_per_token / 2)
175
+
176
+ # Get rough chunk boundaries in characters
177
+ rough_start = max(0, keyword_pos - char_window)
178
+ rough_end = min(len(text), keyword_pos + len(keyword) + char_window)
179
+
180
+ # Extract rough chunk for processing
181
+ rough_chunk = text[rough_start:rough_end]
182
+
183
+ # Find keyword's relative position in rough chunk
184
+ rel_pos = rough_chunk.find(actual_keyword)
185
+ if rel_pos == -1:
186
+ logger.debug(f"Could not locate keyword '{actual_keyword}' in rough chunk for doc {doc_id}")
187
+ continue
188
+
189
+ # Calculate token position by tokenizing text before keyword
190
+ text_before = rough_chunk[:rel_pos]
191
+ tokens_before = self.tokenizer.tokenize(text_before)
192
+ keyword_start_pos = len(tokens_before)
193
+
194
+ # Tokenize necessary parts
195
+ chunk_tokens = self.tokenizer.tokenize(rough_chunk)
196
+ keyword_tokens = self.tokenizer.tokenize(actual_keyword)
197
+ keyword_length = len(keyword_tokens)
198
+
199
+ # Calculate final chunk boundaries in tokens
200
+ tokens_each_side = (chunk_size - keyword_length) // 2
201
+ chunk_start = max(0, keyword_start_pos - tokens_each_side)
202
+ chunk_end = min(len(chunk_tokens), keyword_start_pos + keyword_length + tokens_each_side)
203
+
204
+ # Add overlap if possible
205
+ if chunk_start > 0:
206
+ chunk_start = max(0, chunk_start - self.chunk_overlap)
207
+ if chunk_end < len(chunk_tokens):
208
+ chunk_end = min(len(chunk_tokens), chunk_end + self.chunk_overlap)
209
+
210
+ # Extract final tokens and convert to text
211
+ final_tokens = chunk_tokens[chunk_start:chunk_end]
212
+ chunk_text = self.tokenizer.convert_tokens_to_string(final_tokens)
213
+
214
+ # Verify keyword presence in final chunk
215
+ if chunk_text and actual_keyword in chunk_text:
216
+ chunk_info = {
217
+ "text": chunk_text,
218
+ "primary_keyword": actual_keyword,
219
+ "all_matched_keywords": matched_keywords.lower(),
220
+ "token_count": len(final_tokens),
221
+ "chunk_id": f"{doc_id}_chunk_{i}" if doc_id else f"chunk_{i}",
222
+ "source_doc_id": doc_id
223
+ }
224
+ chunks.append(chunk_info)
225
+ else:
226
+ logger.debug(f"Could not create chunk for keyword '{actual_keyword}' in doc {doc_id}")
227
+
228
+ if chunks:
229
+ logger.debug(f"Created {len(chunks)} chunks for document {doc_id or 'unknown'}")
230
+
231
+ return chunks
232
+
233
+ def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
234
+ treatment_keywords: str, chunk_size: int = 512,
235
+ doc_id: str = None) -> List[Dict[str, Any]]:
236
+ """
237
+ Create chunks for treatment data with both emergency and treatment keywords
238
+
239
+ Args:
240
+ text: Input text
241
+ emergency_keywords: Emergency keywords
242
+ treatment_keywords: Treatment keywords
243
+ chunk_size: Size of each chunk
244
+ doc_id: Document ID for tracking
245
+
246
+ Returns:
247
+ List of chunk dictionaries
248
+ """
249
+ if not treatment_keywords or pd.isna(treatment_keywords):
250
+ return []
251
+
252
+ chunks = []
253
+ em_keywords = emergency_keywords.split("|") if emergency_keywords else []
254
+ tr_keywords = treatment_keywords.split("|") if treatment_keywords else []
255
+
256
+ # Process treatment keywords as primary (since this is treatment-focused data)
257
+ for i, tr_keyword in enumerate(tr_keywords):
258
+ tr_pos = text.lower().find(tr_keyword.lower())
259
+
260
+ if tr_pos != -1:
261
+ # Find closest emergency keyword for context
262
+ closest_em_keyword = None
263
+ closest_distance = float('inf')
264
+
265
+ for em_keyword in em_keywords:
266
+ em_pos = text.lower().find(em_keyword.lower())
267
+ if em_pos != -1:
268
+ distance = abs(tr_pos - em_pos)
269
+ if distance < closest_distance and distance < chunk_size:
270
+ closest_distance = distance
271
+ closest_em_keyword = em_keyword
272
+
273
+ # Calculate chunk boundaries
274
+ if closest_em_keyword:
275
+ # Center between both keywords
276
+ em_pos = text.lower().find(closest_em_keyword.lower())
277
+ center = (tr_pos + em_pos) // 2
278
+ else:
279
+ # Center on treatment keyword
280
+ center = tr_pos
281
+
282
+ start = max(0, center - chunk_size // 2)
283
+ end = min(len(text), center + chunk_size // 2)
284
+
285
+ chunk_text = text[start:end].strip()
286
+
287
+ if chunk_text:
288
+ chunk_info = {
289
+ "text": chunk_text,
290
+ "primary_keyword": tr_keyword,
291
+ "emergency_keywords": emergency_keywords,
292
+ "treatment_keywords": treatment_keywords,
293
+ "closest_emergency_keyword": closest_em_keyword,
294
+ "keyword_distance": closest_distance if closest_em_keyword else None,
295
+ "chunk_start": start,
296
+ "chunk_end": end,
297
+ "chunk_id": f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}",
298
+ "source_doc_id": doc_id
299
+ }
300
+ chunks.append(chunk_info)
301
+
302
+ return chunks
303
+
304
+ def process_emergency_chunks(self) -> List[Dict[str, Any]]:
305
+ """Process emergency data into chunks"""
306
+ if self.emergency_data is None:
307
+ raise ValueError("Emergency data not loaded. Call load_filtered_data() first.")
308
+
309
+ all_chunks = []
310
+
311
+ # Add progress bar with leave=False to avoid cluttering
312
+ for idx, row in tqdm(self.emergency_data.iterrows(),
313
+ total=len(self.emergency_data),
314
+ desc="Processing emergency documents",
315
+ unit="doc",
316
+ leave=False):
317
+ if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
318
+ chunks = self.create_keyword_centered_chunks(
319
+ text=row['clean_text'],
320
+ matched_keywords=row['matched'],
321
+ chunk_size=self.chunk_size,
322
+ doc_id=str(row.get('id', idx))
323
+ )
324
+
325
+ # Add metadata to each chunk
326
+ for chunk in chunks:
327
+ chunk.update({
328
+ 'source_type': 'emergency',
329
+ 'source_title': row.get('title', ''),
330
+ 'source_url': row.get('url', ''),
331
+ 'has_emergency': row.get('has_emergency', True),
332
+ 'doc_type': row.get('type', 'emergency')
333
+ })
334
+
335
+ all_chunks.extend(chunks)
336
+
337
+ self.emergency_chunks = all_chunks
338
+ logger.info(f"Completed processing emergency data: {len(all_chunks)} chunks generated")
339
+ return all_chunks
340
+
341
+ def process_treatment_chunks(self) -> List[Dict[str, Any]]:
342
+ """Process treatment data into chunks"""
343
+ if self.treatment_data is None:
344
+ raise ValueError("Treatment data not loaded. Call load_filtered_data() first.")
345
+
346
+ all_chunks = []
347
+
348
+ # Add progress bar with leave=False to avoid cluttering
349
+ for idx, row in tqdm(self.treatment_data.iterrows(),
350
+ total=len(self.treatment_data),
351
+ desc="Processing treatment documents",
352
+ unit="doc",
353
+ leave=False):
354
+ if (pd.notna(row.get('clean_text')) and
355
+ pd.notna(row.get('treatment_matched'))):
356
+
357
+ chunks = self.create_dual_keyword_chunks(
358
+ text=row['clean_text'],
359
+ emergency_keywords=row.get('matched', ''),
360
+ treatment_keywords=row['treatment_matched'],
361
+ chunk_size=self.chunk_size,
362
+ doc_id=str(row.get('id', idx))
363
+ )
364
+
365
+ # Add metadata to each chunk
366
+ for chunk in chunks:
367
+ chunk.update({
368
+ 'source_type': 'treatment',
369
+ 'source_title': row.get('title', ''),
370
+ 'source_url': row.get('url', ''),
371
+ 'has_emergency': row.get('has_emergency', True),
372
+ 'has_treatment': row.get('has_treatment', True),
373
+ 'doc_type': row.get('type', 'treatment')
374
+ })
375
+
376
+ all_chunks.extend(chunks)
377
+
378
+ self.treatment_chunks = all_chunks
379
+ logger.info(f"Completed processing treatment data: {len(all_chunks)} chunks generated")
380
+ return all_chunks
381
+
382
+ def _get_chunk_hash(self, text: str) -> str:
383
+ """Generate hash for chunk text to use as cache key"""
384
+ import hashlib
385
+ return hashlib.md5(text.encode('utf-8')).hexdigest()
386
+
387
+ def _load_embedding_cache(self, cache_file: str) -> dict:
388
+ """Load embedding cache from file"""
389
+ import pickle
390
+ import os
391
+ if os.path.exists(cache_file):
392
+ try:
393
+ with open(cache_file, 'rb') as f:
394
+ return pickle.load(f)
395
+ except Exception:
396
+ logger.warning(f"Could not load cache file {cache_file}, starting fresh")
397
+ return {}
398
+ return {}
399
+
400
+ def _save_embedding_cache(self, cache: dict, cache_file: str):
401
+ """Save embedding cache to file"""
402
+ import pickle
403
+ import os
404
+ os.makedirs(os.path.dirname(cache_file), exist_ok=True)
405
+ with open(cache_file, 'wb') as f:
406
+ pickle.dump(cache, f)
407
+
408
+ def generate_embeddings(self, chunks: List[Dict[str, Any]],
409
+ chunk_type: str = "emergency") -> np.ndarray:
410
+ """
411
+ Generate embeddings for chunks with caching support
412
+
413
+ Args:
414
+ chunks: List of chunk dictionaries
415
+ chunk_type: Type of chunks ("emergency" or "treatment")
416
+
417
+ Returns:
418
+ numpy array of embeddings
419
+ """
420
+ logger.info(f"Starting embedding generation for {len(chunks)} {chunk_type} chunks...")
421
+
422
+ # Cache setup
423
+ cache_dir = self.models_dir / "cache"
424
+ cache_dir.mkdir(parents=True, exist_ok=True)
425
+ cache_file = cache_dir / f"{chunk_type}_embeddings_cache.pkl"
426
+
427
+ # Load existing cache
428
+ cache = self._load_embedding_cache(str(cache_file))
429
+
430
+ cached_embeddings = []
431
+ to_embed = []
432
+
433
+ # Check cache for each chunk
434
+ for i, chunk in enumerate(chunks):
435
+ chunk_hash = self._get_chunk_hash(chunk['text'])
436
+ if chunk_hash in cache:
437
+ cached_embeddings.append((i, cache[chunk_hash]))
438
+ else:
439
+ to_embed.append((i, chunk_hash, chunk['text']))
440
+
441
+ logger.info(f"Cache status: {len(cached_embeddings)} cached, {len(to_embed)} new chunks to embed")
442
+
443
+ # Generate embeddings for new chunks
444
+ new_embeddings = []
445
+ if to_embed:
446
+ # Load model
447
+ model = self.load_embedding_model()
448
+ texts = [text for _, _, text in to_embed]
449
+
450
+ # Generate embeddings in batches with clear progress
451
+ batch_size = 32
452
+ total_batches = (len(texts) + batch_size - 1) // batch_size
453
+
454
+ logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
455
+
456
+ for i in tqdm(range(0, len(texts), batch_size),
457
+ desc=f"Embedding {chunk_type} subset",
458
+ total=total_batches,
459
+ unit="batch",
460
+ leave=False):
461
+ batch_texts = texts[i:i + batch_size]
462
+ batch_emb = model.encode(
463
+ batch_texts,
464
+ show_progress_bar=False
465
+ )
466
+ new_embeddings.extend(batch_emb)
467
+
468
+ # Update cache with new embeddings
469
+ for (_, chunk_hash, _), emb in zip(to_embed, new_embeddings):
470
+ cache[chunk_hash] = emb
471
+
472
+ # Save updated cache
473
+ self._save_embedding_cache(cache, str(cache_file))
474
+ logger.info(f"Updated cache with {len(new_embeddings)} new embeddings")
475
+
476
+ # Combine cached and new embeddings in correct order
477
+ all_embeddings = [None] * len(chunks)
478
+
479
+ # Place cached embeddings
480
+ for idx, emb in cached_embeddings:
481
+ all_embeddings[idx] = emb
482
+
483
+ # Place new embeddings
484
+ for (idx, _, _), emb in zip(to_embed, new_embeddings):
485
+ all_embeddings[idx] = emb
486
+
487
+ # Convert to numpy array
488
+ result = np.vstack(all_embeddings)
489
+ logger.info(f"Completed embedding generation: shape {result.shape}")
490
+
491
+ return result
492
+
493
+ def build_annoy_index(self, embeddings: np.ndarray,
494
+ index_name: str, n_trees: int = 15) -> AnnoyIndex:
495
+ """
496
+ Build ANNOY index from embeddings
497
+
498
+ Args:
499
+ embeddings: Numpy array of embeddings
500
+ index_name: Name for the index file
501
+ n_trees: Number of trees for ANNOY index
502
+
503
+ Returns:
504
+ Built ANNOY index
505
+ """
506
+ logger.info(f"Building ANNOY index: {index_name}")
507
+
508
+ # Create ANNOY index
509
+ index = AnnoyIndex(self.embedding_dim, 'angular') # angular = cosine similarity
510
+
511
+ # Add vectors to index
512
+ for i, embedding in enumerate(embeddings):
513
+ index.add_item(i, embedding)
514
+
515
+ # Build index
516
+ index.build(n_trees)
517
+
518
+ # Save index
519
+ index_path = self.models_dir / "indices" / "annoy" / f"{index_name}.ann"
520
+ index_path.parent.mkdir(parents=True, exist_ok=True)
521
+ index.save(str(index_path))
522
+
523
+ logger.info(f"ANNOY index saved to: {index_path}")
524
+ return index
525
+
526
+ def save_chunks_and_embeddings(self, chunks: List[Dict[str, Any]],
527
+ embeddings: np.ndarray, chunk_type: str):
528
+ """
529
+ Save chunks metadata and embeddings
530
+
531
+ Args:
532
+ chunks: List of chunk dictionaries
533
+ embeddings: Numpy array of embeddings
534
+ chunk_type: Type of chunks ("emergency" or "treatment")
535
+ """
536
+ logger.info(f"Saving {chunk_type} chunks and embeddings...")
537
+
538
+ # Create output directories
539
+ embeddings_dir = self.models_dir / "embeddings"
540
+ embeddings_dir.mkdir(parents=True, exist_ok=True)
541
+
542
+ # Save chunks metadata
543
+ chunks_file = embeddings_dir / f"{chunk_type}_chunks.json"
544
+ with open(chunks_file, 'w', encoding='utf-8') as f:
545
+ json.dump(chunks, f, ensure_ascii=False, indent=2)
546
+
547
+ # Save embeddings
548
+ embeddings_file = embeddings_dir / f"{chunk_type}_embeddings.npy"
549
+ np.save(embeddings_file, embeddings)
550
+
551
+ logger.info(f"Saved {chunk_type} data:")
552
+ logger.info(f" - Chunks: {chunks_file}")
553
+ logger.info(f" - Embeddings: {embeddings_file}")
554
+
555
+ def validate_data_quality(self) -> Dict[str, Any]:
556
+ """
557
+ Validate data quality and return statistics
558
+
559
+ Returns:
560
+ Dictionary with validation statistics
561
+ """
562
+ logger.info("Validating data quality...")
563
+
564
+ validation_report = {
565
+ "emergency_data": {},
566
+ "treatment_data": {},
567
+ "chunks": {},
568
+ "embeddings": {}
569
+ }
570
+
571
+ # Emergency data validation
572
+ if self.emergency_data is not None:
573
+ validation_report["emergency_data"] = {
574
+ "total_records": len(self.emergency_data),
575
+ "records_with_text": self.emergency_data['clean_text'].notna().sum(),
576
+ "records_with_keywords": self.emergency_data['matched'].notna().sum(),
577
+ "avg_text_length": self.emergency_data['clean_text'].str.len().mean()
578
+ }
579
+
580
+ # Treatment data validation
581
+ if self.treatment_data is not None:
582
+ validation_report["treatment_data"] = {
583
+ "total_records": len(self.treatment_data),
584
+ "records_with_text": self.treatment_data['clean_text'].notna().sum(),
585
+ "records_with_emergency_keywords": self.treatment_data['matched'].notna().sum(),
586
+ "records_with_treatment_keywords": self.treatment_data['treatment_matched'].notna().sum(),
587
+ "avg_text_length": self.treatment_data['clean_text'].str.len().mean()
588
+ }
589
+
590
+ # Chunks validation
591
+ validation_report["chunks"] = {
592
+ "emergency_chunks": len(self.emergency_chunks),
593
+ "treatment_chunks": len(self.treatment_chunks),
594
+ "total_chunks": len(self.emergency_chunks) + len(self.treatment_chunks)
595
+ }
596
+
597
+ if self.emergency_chunks:
598
+ avg_chunk_length = np.mean([len(chunk['text']) for chunk in self.emergency_chunks])
599
+ validation_report["chunks"]["avg_emergency_chunk_length"] = avg_chunk_length
600
+
601
+ if self.treatment_chunks:
602
+ avg_chunk_length = np.mean([len(chunk['text']) for chunk in self.treatment_chunks])
603
+ validation_report["chunks"]["avg_treatment_chunk_length"] = avg_chunk_length
604
+
605
+ # Check if embeddings exist
606
+ embeddings_dir = self.models_dir / "embeddings"
607
+ if embeddings_dir.exists():
608
+ emergency_emb_file = embeddings_dir / "emergency_embeddings.npy"
609
+ treatment_emb_file = embeddings_dir / "treatment_embeddings.npy"
610
+
611
+ validation_report["embeddings"] = {
612
+ "emergency_embeddings_exist": emergency_emb_file.exists(),
613
+ "treatment_embeddings_exist": treatment_emb_file.exists()
614
+ }
615
+
616
+ if emergency_emb_file.exists():
617
+ emb = np.load(emergency_emb_file)
618
+ validation_report["embeddings"]["emergency_embeddings_shape"] = emb.shape
619
+
620
+ if treatment_emb_file.exists():
621
+ emb = np.load(treatment_emb_file)
622
+ validation_report["embeddings"]["treatment_embeddings_shape"] = emb.shape
623
+
624
+ # Save validation report
625
+ report_file = self.models_dir / "data_validation_report.json"
626
+ with open(report_file, 'w', encoding='utf-8') as f:
627
+ json.dump(validation_report, f, indent=2, default=str)
628
+
629
+ logger.info(f"Validation report saved to: {report_file}")
630
+ return validation_report
631
+
632
+ def process_all_data(self) -> Dict[str, Any]:
633
+ """
634
+ Complete data processing pipeline
635
+
636
+ Returns:
637
+ Processing summary
638
+ """
639
+ logger.info("Starting complete data processing pipeline...")
640
+
641
+ # Step 1: Load filtered data
642
+ self.load_filtered_data()
643
+
644
+ # Step 2: Process chunks
645
+ emergency_chunks = self.process_emergency_chunks()
646
+ treatment_chunks = self.process_treatment_chunks()
647
+
648
+ # Step 3: Generate embeddings
649
+ emergency_embeddings = self.generate_embeddings(emergency_chunks, "emergency")
650
+ treatment_embeddings = self.generate_embeddings(treatment_chunks, "treatment")
651
+
652
+ # Step 4: Build ANNOY indices
653
+ self.emergency_index = self.build_annoy_index(emergency_embeddings, "emergency_index")
654
+ self.treatment_index = self.build_annoy_index(treatment_embeddings, "treatment_index")
655
+
656
+ # Step 5: Save data
657
+ self.save_chunks_and_embeddings(emergency_chunks, emergency_embeddings, "emergency")
658
+ self.save_chunks_and_embeddings(treatment_chunks, treatment_embeddings, "treatment")
659
+
660
+ # Step 6: Validate data quality
661
+ validation_report = self.validate_data_quality()
662
+
663
+ # Summary
664
+ summary = {
665
+ "status": "completed",
666
+ "emergency_chunks": len(emergency_chunks),
667
+ "treatment_chunks": len(treatment_chunks),
668
+ "emergency_embeddings_shape": emergency_embeddings.shape,
669
+ "treatment_embeddings_shape": treatment_embeddings.shape,
670
+ "indices_created": ["emergency_index.ann", "treatment_index.ann"],
671
+ "validation_report": validation_report
672
+ }
673
+
674
+ logger.info("Data processing pipeline completed successfully!")
675
+ logger.info(f"Summary: {summary}")
676
+
677
+ return summary
678
+
679
+ def main():
680
+ """Main function for testing the data processor"""
681
+ # Initialize processor
682
+ processor = DataProcessor()
683
+
684
+ # Run complete pipeline
685
+ summary = processor.process_all_data()
686
+
687
+ print("\n" + "="*50)
688
+ print("DATA PROCESSING COMPLETED")
689
+ print("="*50)
690
+ print(f"Emergency chunks: {summary['emergency_chunks']}")
691
+ print(f"Treatment chunks: {summary['treatment_chunks']}")
692
+ print(f"Emergency embeddings: {summary['emergency_embeddings_shape']}")
693
+ print(f"Treatment embeddings: {summary['treatment_embeddings_shape']}")
694
+ print(f"Indices created: {summary['indices_created']}")
695
+ print("="*50)
696
+
697
+ if __name__ == "__main__":
698
+ main()
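
The module above writes its chunks, embeddings, and ANNOY indices under `models/`, and the test report in the next file was produced by querying those artifacts. A minimal sketch of reloading them for ad-hoc retrieval; the paths, the 768-dimension, and the `angular` metric follow the defaults in `DataProcessor`, and this is illustrative only, not code from this commit:

```python
# Sketch only: reload the artifacts produced by DataProcessor.process_all_data().
import json
from pathlib import Path

import numpy as np
from annoy import AnnoyIndex

models_dir = Path("models")

chunks = json.loads((models_dir / "embeddings" / "emergency_chunks.json").read_text(encoding="utf-8"))
embeddings = np.load(models_dir / "embeddings" / "emergency_embeddings.npy")

index = AnnoyIndex(768, "angular")  # must match embedding_dim and metric used at build time
index.load(str(models_dir / "indices" / "annoy" / "emergency_index.ann"))

# Nearest neighbours of the first chunk's own embedding.
ids, distances = index.get_nns_by_vector(embeddings[0], 5, include_distances=True)
for i, d in zip(ids, distances):
    print(f"{d:.3f}  {chunks[i]['primary_keyword']}: {chunks[i]['text'][:80]}")
```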
tests/embedding_test_analysis.md ADDED
@@ -0,0 +1,355 @@
1
+ # Embedding Test Analysis Report
2
+
3
+ ## 1. Dataset Overview
4
+
5
+ ### 1.1 Data Dimensions
6
+ - Emergency Dataset: 27,493 chunks × 768 dimensions
7
+ - Treatment Dataset: 82,378 chunks × 768 dimensions
8
+ - Total Chunks: 109,871
9
+
10
+ ### 1.2 Embedding Statistics
11
+
12
+ **Emergency Embeddings:**
13
+ - Value Range: -3.246 to 3.480
14
+ - Mean: -0.017
15
+ - Standard Deviation: 0.462
16
+
17
+ **Treatment Embeddings:**
18
+ - Value Range: -3.686 to 3.505
19
+ - Mean: -0.017
20
+ - Standard Deviation: 0.472
21
+
22
+ **Analysis:**
23
+ - Both datasets have similar statistical properties
24
+ - Mean values are centered around zero (-0.017)
25
+ - Standard deviations are comparable (0.462 vs 0.472)
26
+ - Treatment dataset has slightly wider range (-3.686 to 3.505 vs -3.246 to 3.480)
27
+
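
The figures above can be reproduced directly from the saved embedding matrices; a minimal sketch, with file names assumed from `src/data_processing.py`:

```python
# Sketch: recompute the range / mean / std reported above from the saved .npy files.
import numpy as np

for name in ("emergency", "treatment"):
    emb = np.load(f"models/embeddings/{name}_embeddings.npy")
    print(f"{name}: shape={emb.shape}, "
          f"range=[{emb.min():.3f}, {emb.max():.3f}], "
          f"mean={emb.mean():.3f}, std={emb.std():.3f}")
```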
28
+ ## 2. Model Performance
29
+
30
+ ### 2.1 Self-Retrieval Test
31
+ - Test Size: 20 random samples
32
+ - Success Rate: 19/20 (95%)
33
+ - Failed Case: Index 27418
34
+ - Average Response Time: ~5ms per search
35
+
36
+ **Observations:**
37
+ - High success rate in self-retrieval (95%)
38
+ - One failure case needs investigation
39
+ - Search operations are consistently fast
40
+
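
The self-retrieval check can be reproduced in a few lines against the saved index. This is a hedged sketch of the procedure described above, not the actual `test_embedding_validation.py` script:

```python
# Sketch: self-retrieval test — each sampled item should return itself as its own
# nearest neighbour. Index path and dimension follow src/data_processing.py.
import random
from annoy import AnnoyIndex

index = AnnoyIndex(768, "angular")
index.load("models/indices/annoy/emergency_index.ann")

sample = random.sample(range(index.get_n_items()), 20)
hits = sum(1 for i in sample if index.get_nns_by_item(i, 1)[0] == i)
print(f"Self-retrieval success rate: {hits}/{len(sample)}")
```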
41
+ ### 2.2 Cross-Dataset Search Performance
42
+
43
+ **Test Queries:**
44
+ 1. "What is the treatment protocol for acute myocardial infarction?"
45
+ 2. "How to manage severe chest pain with difficulty breathing?"
46
+ 3. "What are the emergency procedures for anaphylactic shock?"
47
+
48
+ **Key Findings:**
49
+ - Each query returns top-5 results from both datasets
50
+ - Results show semantic understanding (not just keyword matching)
51
+ - First sentences provide good context for relevance assessment
52
+
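
A minimal sketch of the cross-dataset query flow summarized above; the model name, index paths, and top-5 cut-off are taken from the pipeline in `src/data_processing.py`, and the code is an illustration rather than the test script itself:

```python
# Sketch: encode a query once, then retrieve top-5 neighbours from each index.
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")

indices = {}
for name in ("emergency", "treatment"):
    idx = AnnoyIndex(768, "angular")
    idx.load(f"models/indices/annoy/{name}_index.ann")
    indices[name] = idx

query = "What is the treatment protocol for acute myocardial infarction?"
query_vec = model.encode(query)

for name, idx in indices.items():
    ids, dists = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
    print(name, [(i, round(d, 3)) for i, d in zip(ids, dists)])
```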
53
+ ## 3. System Performance
54
+
55
+ ### 3.1 Response Times
56
+ - Model Loading: ~3 seconds
57
+ - Embedding Validation: ~0.5 seconds
58
+ - Search Operations: 0.1-0.2 seconds per query
59
+
60
+ ### 3.2 Resource Usage
61
+ - Model loaded on MPS (Metal Performance Shaders)
62
+ - Efficient memory usage for large datasets
63
+ - Fast vector operations
64
+
65
+ ## 4. Recommendations
66
+
67
+ ### 4.1 Immediate Improvements
68
+ 1. Investigate failed self-retrieval case (index 27418)
69
+ 2. Consider caching frequently accessed embeddings
70
+ 3. Add more diverse test queries
71
+
72
+ ### 4.2 Future Enhancements
73
+ 1. Implement hybrid search (combine with BM25)
74
+ 2. Add relevance scoring mechanism
75
+ 3. Consider domain-specific test cases
76
+
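
For point 1 above, one possible shape of the hybrid search is to fuse normalized BM25 scores with the dense ANNOY distances. A hedged sketch, assuming `rank_bm25` as an extra dependency and an untuned fusion weight:

```python
# Sketch of hybrid (lexical + dense) scoring for a candidate set of chunks.
# rank_bm25 is an assumed extra dependency; alpha is illustrative, not tuned.
import numpy as np
from rank_bm25 import BM25Okapi

def hybrid_scores(query, chunk_texts, dense_distances, alpha=0.5):
    """Blend normalized BM25 scores with (1 - normalized dense distance)."""
    bm25 = BM25Okapi([t.lower().split() for t in chunk_texts])
    lexical = bm25.get_scores(query.lower().split())
    lexical = lexical / (lexical.max() + 1e-9)
    dense = 1.0 - np.asarray(dense_distances) / (np.max(dense_distances) + 1e-9)
    return alpha * lexical + (1.0 - alpha) * dense
```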
77
+ ## 5. Log Analysis
78
+
79
+ ### 5.1 Log Structure
80
+ ```
81
+ timestamp - level - message
82
+ ```
83
+
84
+ ### 5.2 Log Levels Used
85
+ - DEBUG: Detailed operation info
86
+ - INFO: General progress and results
87
+ - WARNING: Non-critical issues
88
+ - ERROR: Critical failures
89
+
90
+ ### 5.3 Key Log Categories
91
+ 1. **Initialization Logs:**
92
+ - Path configurations
93
+ - Model loading
94
+ - Dataset loading
95
+
96
+ 2. **Performance Logs:**
97
+ - Search operations
98
+ - Response times
99
+ - Success/failure counts
100
+
101
+ 3. **Error Logs:**
102
+ - Failed searches
103
+ - Validation errors
104
+ - Connection issues
105
+
106
+ ### 5.4 Notable Log Patterns
107
+ - Regular HTTPS connections to HuggingFace
108
+ - Consistent search operation timing
109
+ - Clear error messages for failures
110
+
111
+
112
+ <!-- split -->
113
+
114
+
115
+ # 🧪 Embedding Test Analysis Report
116
+
117
+ ## 1. Dataset Overview
118
+
119
+ ### 1.1 Data Dimensions
120
+ - **Emergency Dataset**: 27,493 chunks × 768 dimensions
121
+ - **Treatment Dataset**: 82,378 chunks × 768 dimensions
122
+ - **Total Chunks**: 109,871
123
+
124
+ ### 1.2 Embedding Statistics
125
+ **Emergency Embeddings:**
126
+ - Value Range: -3.246 ~ 3.480
127
+ - Mean: -0.017
128
+ - Std: 0.462
129
+
130
+ **Treatment Embeddings:**
131
+ - Value Range: -3.686 ~ 3.505
132
+ - Mean: -0.017
133
+ - Std: 0.472
134
+
135
+ **Analysis:**
136
+ - The embedding distributions of the two datasets are similar, and both means are close to 0
137
+ - The Treatment dataset has a slightly wider range and may cover broader semantics
138
+
139
+ ---
140
+
141
+ ## 2. Model Performance
142
+
143
+ ### 2.1 Self-Retrieval Test
144
+ - Test Size: 20
145
+ - Success Rate: **95% (19/20)**
146
+ - Failed Index: `27418`
147
+ - Avg Search Time: ~5ms
148
+
149
+ **Observation:**
150
+ - The high self-retrieval success rate shows the index was built accurately
151
+ - The failed sample can be examined further to check whether its chunk is too short
152
+
153
+
154
+ <!-- Details -->
155
+
156
+ # 🔍 Embedding Search Analysis Report (Emergency vs Treatment)
157
+
158
+ ## 📊 Overall Summary
159
+
160
+ | Query | Emergency Results | Treatment Results | Summary Comment |
161
+ |---------------------------------------------------------|------------------------|------------------------|-----------------------------------------------|
162
+ | 1️⃣ Treatment for Acute Myocardial Infarction | ✅ Matched well | ✅ Highly relevant | Relevant guidelines retrieved from both sets |
163
+ | 2️⃣ Management of Severe Chest Pain with Dyspnea | ⚠️ Redundant, not focused | ⚠️ Vague and general | Lacks actionable steps, contains repetition |
164
+ | 3️⃣ Emergency Procedures for Anaphylactic Shock | ⚠️ Off-topic | ✅ Precise and relevant | Emergency off-topic, but Treatment is strong |
165
+
166
+ ---
167
+
168
+ ## Detailed Query Analysis
169
+
170
+ ### ✅ 1. `What is the treatment protocol for acute myocardial infarction?`
171
+
172
+ #### 📌 Emergency Dataset:
173
+ - `E-2 ~ E-4` mention guidelines, STEMI, PCI.
174
+ - Distances range from `0.833 ~ 0.842` → valid.
175
+ - `E-3` is a long guideline chunk → ideal RAG candidate.
176
+
177
+ ✅ Conclusion: Emergency subset performs well, keyword chunking effective.
178
+
179
+ #### 📌 Treatment Dataset:
180
+ - `T-1` and `T-2` directly address the question with guideline phrases.
181
+ - `distance ~0.813` → strong semantic match.
182
+ - `T-5` is shorter but still contains “AMI”.
183
+
184
+ ✅ Conclusion: Treatment retrieval is highly effective.
185
+
186
+ ---
187
+
188
+ ### ⚠️ 2. `How to manage severe chest pain with difficulty breathing?`
189
+
190
+ #### 📌 Emergency Dataset:
191
+ - `E-1 ~ E-3` are identical dyspnea passages; no actionable steps.
192
+ - `E-4 ~ E-5` are general symptom overviews, not acute response protocols.
193
+
194
+ ⚠️ Issue: Semantic match exists, but lacks procedural content.
195
+ ⚠️ The repeated identical passages suggest near-duplicate chunks in the index, so retrieval stays within one narrow cluster.
196
+
197
+ #### 📌 Treatment Dataset:
198
+ - `T-1 ~ T-3` mention dyspnea and chest pain but are mostly patient descriptions.
199
+ - `T-4` hints at emergency care for asthma but still lacks clarity.
200
+
201
+ ⚠️ Conclusion: This query needs better symptom-action co-occurrence modeling.
202
+
203
+ ---
204
+
205
+ ### ⚠️ 3. `What are the emergency procedures for anaphylactic shock?`
206
+
207
+ #### 📌 Emergency Dataset:
208
+ - `E-1 ~ E-2`: irrelevant or truncated.
209
+ - `E-3`: mentions management during anesthesia → partial match.
210
+ - `E-4 ~ E-5`: just list multiple shock types; no protocol info.
211
+
212
+ ❌ Emergency dataset lacks focused content on this topic.
213
+
214
+ #### 📌 Treatment Dataset:
215
+ - `T-1`: explicitly lists epinephrine, oxygen, IV fluids, corticosteroids → ✅ ideal
216
+ - `T-2`: confirms emergency drug prep
217
+ - `T-3 ~ T-5`: all recognize anaphylactic shock
218
+
219
+ ✅ Conclusion: Treatment subset captures this case very accurately.
220
+
221
+ ---
222
+
223
+ ## 📏 Distance Threshold Reference
224
+
225
+ | Distance Value Range | Interpretation |
226
+ |----------------------|--------------------------------------------|
227
+ | `< 0.80` | Very strong match (almost identical) |
228
+ | `0.80 ~ 0.86` | Acceptable semantic match |
229
+ | `> 0.90` | Weak relevance, possibly off-topic chunks |
230
+
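
These cut-offs are ANNOY `angular` distances, where distance = sqrt(2 · (1 − cosine similarity)); a small illustrative helper (not part of the commit) converts the table's values into cosine terms:

```python
# ANNOY's "angular" metric is sqrt(2 * (1 - cos)), so an angular distance d
# corresponds to a cosine similarity of 1 - d^2 / 2.
def angular_to_cosine(distance: float) -> float:
    return 1.0 - (distance ** 2) / 2.0

for d in (0.80, 0.86, 0.90):
    print(f"angular {d:.2f}  ->  cosine ~{angular_to_cosine(d):.2f}")
```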
231
+ ---
232
+
233
+ ## 🧰 Recommendations Based on Findings
234
+
235
+ | Issue Type | Suggested Solution |
236
+
237
+
238
+ (genAIvenv) yanbochen@YanBos-MacBook-Pro tests % python test_embedding_validation.py
239
+
240
+
241
+ === Query: What is the treatment protocol for acute myocardial infarction? ===
242
+ Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6.65it/s]
243
+
244
+ Emergency Dataset Results:
245
+
246
+ E-1 (distance: 0.826):
247
+ myocardial infarction, white [ / bib _ ref ].
248
+
249
+ E-2 (distance: 0.833):
250
+ the management of acute myocardial infarction : guidelines and audit standards successful management of acute myocardial infarction depends in the first instance on the patient recognising the symptoms and seeking help as quickly as possible.
251
+
252
+ E-3 (distance: 0.836):
253
+ sandbox : stemi # 2017 esc guidelines for the management of acute myocardial infarction in patients presenting with st - segment elevation # # changes in recommendations # # what is new in 2017 guidelines on ami - stemi? # # ami - stemi - 2017 new recommendations # acc / aats / aha / ase / asnc / scai / scct / sts 2016 appropriate use criteria for coronary revascularization in patients with acute coronary syndromes # # stemi — immediate revascularization by pci # # stemi — initial treatment by fibrinolytic therapy # # stemi — revascularization of nonculprit artery during the initial hospitalization # 2017 aha / acc clinical performance and quality measures for adults with st - elevation and non – st - elevation myocardial infarction # # revised stemi and nstemi measures # # revised stemi and nstemi measures.
254
+
255
+ E-4 (distance: 0.842):
256
+ stemi resident survival guide # overview st elevation myocardial infarction ( stemi ) is a syndrome characterized by the presence of symptoms of myocardial ischemia associated with persistent st elevation on electrocardiogram and elevated cardiac enzymes.
257
+
258
+ E-5 (distance: 0.879):
259
+ # pre - discharge care abbreviations : ace : angiotensin converting enzyme ; lvef : left ventricular ejection fraction ; mi : myocardial infarction ; pci : percutaneous coronary intervention ; po : per os ; stemi : st elevation myocardial infarction ; vf : ventricular fibrillation ; vt : ventricular tachycardia # long term management abbreviations : ace : angiotensin converting enzyme ; arb : angiotensin receptor blocker ; mi : myocardial infarction # do ' s - a pre - hospital ecg is recommended.
260
+
261
+ Treatment Dataset Results:
262
+
263
+ T-1 (distance: 0.813):
264
+ intain the standard of care and timely access of patients with ACS, including acute myocardial infarction (AMI), to reperfusion therapy.
265
+
266
+ T-2 (distance: 0.825):
267
+ The Management of Acute Myocardial Infarction: Guidelines and Audit Standards
268
+
269
+ Successful management of acute myocardial infarction.
270
+
271
+ T-3 (distance: 0.854):
272
+ fined as STEMI, NSTEMI or unstable angina.
273
+
274
+ T-4 (distance: 0.869):
275
+ Japan, there are no clear guidelines focusing on procedural aspect of the standardized care.
276
+
277
+ T-5 (distance: 0.879):
278
+ ients with acute myocardial infarction (AMI).
279
+
280
+
281
+ === Query: How to manage severe chest pain with difficulty breathing? ===
282
+ Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.76it/s]
283
+
284
+ Emergency Dataset Results:
285
+
286
+ E-1 (distance: 0.848):
287
+ shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
288
+
289
+ E-2 (distance: 0.849):
290
+ shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
291
+
292
+ E-3 (distance: 0.852):
293
+ shortness of breath resident survival guide # overview dyspnea is a symptom, it must generally be distinguished from signs that clinicians typically invoke as evidence of respiratory distress, such as tachypnea, use of accessory muscles, and intercostal retractions.
294
+
295
+ E-4 (distance: 0.879):
296
+ sandbox : milan # overview dyspnea is the uncomfortable awareness of one ' s own breathing.
297
+
298
+ E-5 (distance: 0.879):
299
+ sandbox : milan # overview dyspnea is the uncomfortable awareness of one ' s own breathing.
300
+
301
+ Treatment Dataset Results:
302
+
303
+ T-1 (distance: 0.827):
304
+ lly cyanotic and clammy, and may experience dyspnea or chest pain from underperfusion 13 .
305
+
306
+ T-2 (distance: 0.868):
307
+ acterized by a worsening of the patient’s respiratory symptoms (baseline dyspnea, cough, and/or sputum production) that is beyond normal day-to-day variations and leads to a change in medication.
308
+
309
+ T-3 (distance: 0.872):
310
+ ally cyanotic and clammy, and may experience dyspnea or chest pain from underperfusion 13.
311
+
312
+ T-4 (distance: 0.898):
313
+ ce used to test breathing) results show your breathing problems are worsening
314
+ - you need to go to the emergency room for asthma treatment.
315
+
316
+ T-5 (distance: 0.898):
317
+ breathlessness in a person in the last days of life.
318
+
319
+
320
+ === Query: What are the emergency procedures for anaphylactic shock? ===
321
+ Batches: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.16it/s]
322
+
323
+ Emergency Dataset Results:
324
+
325
+ E-1 (distance: 0.924):
326
+ the other.
327
+
328
+ E-2 (distance: 0.943):
329
+ ic defibrillation.
330
+
331
+ E-3 (distance: 0.946):
332
+ suspected anaphylactic reactions associated with anaesthesia # # summary ( 1 ) the aagbi has published guidance on management of anaphylaxis during anaesthesia in.
333
+
334
+ E-4 (distance: 0.952):
335
+ - gastrointestinal bleeding - perforated peptic ulcer - post - procedural or post - surgical - retroperitoneal hemorrhage - rupture ovarian cyst - trauma - distributive shock - sepsis - toxic shock syndrome - anaphylactic or anaphylactoid reaction - neurogenic shock - adrenal crisis # fire : focused initial rapid evaluation a focused initial rapid evaluation ( fire ) should be performed to identify patients in need of immediate intervention.
336
+
337
+ E-5 (distance: 0.954):
338
+ - surgical - retroperitoneal hemorrhage - rupture ovarian cyst - trauma - distributive shock - sepsis - toxic shock syndrome - anaphylactic or anaphylactoid reaction - neurogenic shock - adrenal crisis # fire : focused initial rapid evaluation a focused initial rapid evaluation ( fire ) should be performed to identify patients in need of immediate intervention.
339
+
340
+ Treatment Dataset Results:
341
+
342
+ T-1 (distance: 0.813):
343
+ ensitivity (anaphylactic) reactions require emergency treatment with epinephrine and other emergency measures, that may include airway management, oxygen, intravenous fluids, antihistamines, corticosteroids, and vasopressors as clinically indicated.
344
+
345
+ T-2 (distance: 0.833):
346
+ ave standard emergency treatments for hypersensitivity or anaphylactic reactions readily available in the operating room (e.
347
+
348
+ T-3 (distance: 0.838):
349
+ e, or systemic inflammation (anaphylactic shock).
350
+
351
+ T-4 (distance: 0.843):
352
+ ED AND APPROPRIATE THERAPY INSTITUTED.
353
+
354
+ T-5 (distance: 0.844):
355
+ UED AND APPROPRIATE THERAPY INSTITUTED.
tests/test_data_processing.py ADDED
@@ -0,0 +1,228 @@
1
+ """
2
+ Test script for data_processing.py
3
+
4
+ This script tests the basic functionality without running the full pipeline
5
+ to ensure everything is working correctly before proceeding with embedding generation.
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+ import pandas as pd
11
+
12
+ # Add src to path
13
+ sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
14
+
15
+ from data_processing import DataProcessor
16
+ import logging
17
+
18
+ # Setup logging
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format='%(levelname)s:%(name)s:%(message)s'
22
+ )
23
+ # Silence urllib3 logging
24
+ logging.getLogger('urllib3').setLevel(logging.WARNING)
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ def test_data_loading():
29
+ """Test data loading functionality"""
30
+ print("="*50)
31
+ print("TESTING DATA LOADING")
32
+ print("="*50)
33
+
34
+ try:
35
+ # Initialize processor with explicit base directory
36
+ base_dir = Path(__file__).parent.parent.resolve()
37
+ processor = DataProcessor(base_dir=str(base_dir))
38
+
39
+ # Test data loading
40
+ emergency_data, treatment_data = processor.load_filtered_data()
41
+
42
+ print(f"✅ Emergency data loaded: {len(emergency_data)} records")
43
+ print(f"✅ Treatment data loaded: {len(treatment_data)} records")
44
+
45
+ # Check data structure
46
+ print("\nEmergency data columns:", list(emergency_data.columns))
47
+ print("Treatment data columns:", list(treatment_data.columns))
48
+
49
+ # Show sample data
50
+ if len(emergency_data) > 0:
51
+ print(f"\nSample emergency matched keywords: {emergency_data['matched'].iloc[0]}")
52
+
53
+ if len(treatment_data) > 0:
54
+ print(f"Sample treatment matched keywords: {treatment_data['treatment_matched'].iloc[0]}")
55
+
56
+ return True
57
+
58
+ except Exception as e:
59
+ print(f"❌ Data loading failed: {e}")
60
+ return False
61
+
62
+ def test_chunking():
63
+ """Test chunking functionality"""
64
+ print("\n" + "="*50)
65
+ print("TESTING CHUNKING FUNCTIONALITY")
66
+ print("="*50)
67
+
68
+ try:
69
+ # Initialize processor
70
+ processor = DataProcessor()
71
+
72
+ # Load data
73
+ processor.load_filtered_data()
74
+
75
+ # Test emergency chunking (just first few records)
76
+ print("Testing emergency chunking...")
77
+ emergency_chunks = []
78
+ for idx, row in processor.emergency_data.head(3).iterrows():
79
+ if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
80
+ chunks = processor.create_keyword_centered_chunks(
81
+ text=row['clean_text'],
82
+ matched_keywords=row['matched'],
83
+ chunk_size=512,
84
+ doc_id=str(row.get('id', idx))
85
+ )
86
+ emergency_chunks.extend(chunks)
87
+
88
+ print(f"✅ Generated {len(emergency_chunks)} emergency chunks from 3 records")
89
+
90
+ # Test treatment chunking (just first few records)
91
+ print("Testing treatment chunking...")
92
+ treatment_chunks = []
93
+ for idx, row in processor.treatment_data.head(3).iterrows():
94
+ if (pd.notna(row.get('clean_text')) and
95
+ pd.notna(row.get('treatment_matched'))):
96
+ chunks = processor.create_dual_keyword_chunks(
97
+ text=row['clean_text'],
98
+ emergency_keywords=row.get('matched', ''),
99
+ treatment_keywords=row['treatment_matched'],
100
+ chunk_size=512,
101
+ doc_id=str(row.get('id', idx))
102
+ )
103
+ treatment_chunks.extend(chunks)
104
+
105
+ print(f"✅ Generated {len(treatment_chunks)} treatment chunks from 3 records")
106
+
107
+ # Show sample chunk
108
+ if emergency_chunks:
109
+ sample_chunk = emergency_chunks[0]
110
+ print(f"\nSample emergency chunk:")
111
+ print(f" Primary keyword: {sample_chunk['primary_keyword']}")
112
+ print(f" Text length: {len(sample_chunk['text'])}")
113
+ print(f" Text preview: {sample_chunk['text'][:100]}...")
114
+
115
+ if treatment_chunks:
116
+ sample_chunk = treatment_chunks[0]
117
+ print(f"\nSample treatment chunk:")
118
+ print(f" Primary keyword: {sample_chunk['primary_keyword']}")
119
+ print(f" Emergency keywords: {sample_chunk['emergency_keywords']}")
120
+ print(f" Text length: {len(sample_chunk['text'])}")
121
+ print(f" Text preview: {sample_chunk['text'][:100]}...")
122
+
123
+ return True
124
+
125
+ except Exception as e:
126
+ print(f"❌ Chunking test failed: {e}")
127
+ import traceback
128
+ traceback.print_exc()
129
+ return False
130
+
131
+ def test_model_loading():
132
+ """Test if we can load the embedding model"""
133
+ print("\n" + "="*50)
134
+ print("TESTING MODEL LOADING")
135
+ print("="*50)
136
+
137
+ try:
138
+ processor = DataProcessor()
139
+
140
+ print("Loading NeuML/pubmedbert-base-embeddings...")
141
+ model = processor.load_embedding_model()
142
+
143
+ print(f"✅ Model loaded successfully: {processor.embedding_model_name}")
144
+ print(f"✅ Model max sequence length: {model.max_seq_length}")
145
+
146
+ # Test a simple encoding
147
+ test_text = "Patient presents with chest pain and shortness of breath."
148
+ embedding = model.encode([test_text])
149
+
150
+ print(f"✅ Test embedding shape: {embedding.shape}")
151
+ print(f"✅ Expected dimension: {processor.embedding_dim}")
152
+
153
+ assert embedding.shape[1] == processor.embedding_dim, f"Dimension mismatch: {embedding.shape[1]} != {processor.embedding_dim}"
154
+
155
+ return True
156
+
157
+ except Exception as e:
158
+ print(f"❌ Model loading failed: {e}")
159
+ import traceback
160
+ traceback.print_exc()
161
+ return False
162
+
163
+ def test_token_chunking():
164
+ """Test token-based chunking functionality"""
165
+ try:
166
+ processor = DataProcessor()
167
+
168
+ test_text = "Patient presents with acute chest pain radiating to left arm. Initial ECG shows ST elevation."
169
+ test_keywords = "chest pain|ST elevation"
170
+
171
+ chunks = processor.create_keyword_centered_chunks(
172
+ text=test_text,
173
+ matched_keywords=test_keywords
174
+ )
175
+
176
+ print(f"\nToken chunking test:")
177
+ print(f"✓ Generated {len(chunks)} chunks")
178
+ for i, chunk in enumerate(chunks, 1):
179
+ print(f"\nChunk {i}:")
180
+ print(f" Primary keyword: {chunk['primary_keyword']}")
181
+ print(f" Content: {chunk['text']}")
182
+
183
+ return True
184
+
185
+ except Exception as e:
186
+ print(f"❌ Token chunking test failed: {e}")
187
+ return False
188
+
189
+ def main():
190
+ """Run all tests"""
191
+ print("Starting data processing tests...\n")
192
+
193
+ # Import pandas here since it's used in chunking test
194
+ import pandas as pd
195
+
196
+ tests = [
197
+ test_data_loading,
198
+ test_chunking,
199
+ test_model_loading,
200
+ test_token_chunking # Added new test
201
+ ]
202
+
203
+ results = []
204
+ for test in tests:
205
+ result = test()
206
+ results.append(result)
207
+
208
+ print("\n" + "="*50)
209
+ print("TEST SUMMARY")
210
+ print("="*50)
211
+
212
+ for i, (test, result) in enumerate(zip(tests, results), 1):
213
+ status = "✅ PASSED" if result else "❌ FAILED"
214
+ print(f"{i}. {test.__name__}: {status}")
215
+
216
+ all_passed = all(results)
217
+
218
+ if all_passed:
219
+ print("\n🎉 All tests passed! Ready to proceed with full pipeline.")
220
+ print("\nTo run the full data processing pipeline:")
221
+ print("cd FinalProject && python src/data_processing.py")
222
+ else:
223
+ print("\n⚠️ Some tests failed. Please check the issues above.")
224
+
225
+ return all_passed
226
+
227
+ if __name__ == "__main__":
228
+ main()
tests/test_embedding_and_index.py ADDED
@@ -0,0 +1,29 @@
1
+ import numpy as np
2
+ from annoy import AnnoyIndex
3
+ import pytest
4
+ import sys
+ from pathlib import Path
+
+ # Make src/ importable so data_processing resolves when pytest runs from the repo root or tests/
+ sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
+ from data_processing import DataProcessor
5
+
6
+ @pytest.fixture(scope="module")
7
+ def processor():
8
+ # Anchor on the project root (one level above tests/) so models/ paths resolve regardless of cwd
+ return DataProcessor(base_dir=str(Path(__file__).parent.parent.resolve()))
9
+
10
+ def test_embedding_dimensions(processor):
11
+ # load emergency embeddings
12
+ emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
13
+ expected_dim = processor.embedding_dim
14
+ assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
15
+ assert emb.shape[1] == expected_dim, (
16
+ f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
17
+ )
18
+
19
+ def test_annoy_search(processor):
20
+ # load embeddings
21
+ emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
22
+ # load Annoy index
23
+ idx = AnnoyIndex(processor.embedding_dim, 'angular')
24
+ idx.load(str(processor.models_dir / "indices" / "annoy" / "emergency_index.ann"))
25
+ # perform a sample query
26
+ query_vec = emb[0]
27
+ ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
28
+ assert len(ids) == 5
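+ # Annoy 'angular' distances satisfy d = sqrt(2 - 2*cos), so valid values lie in [0, 2]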
29
+ assert all(0 <= d <= 2 for d in distances)
tests/test_embedding_validation.py ADDED
@@ -0,0 +1,213 @@
1
+ """
2
+ Test suite for validating embeddings and ANNOY functionality.
3
+ This module ensures the quality of embeddings and the correctness of ANNOY search.
4
+ """
5
+
6
+ import numpy as np
7
+ import json
8
+ import logging
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Tuple, List, Optional
12
+ from annoy import AnnoyIndex
13
+ from sentence_transformers import SentenceTransformer
14
+
15
+ class TestEmbeddingValidation:
16
+ def setup_class(self):
17
+ """Initialize test environment with necessary data and models."""
18
+ # Setup logging
19
+ logging.basicConfig(
20
+ level=logging.DEBUG,
21
+ format='%(asctime)s - %(levelname)s - %(message)s',
22
+ filename='embedding_validation.log'
23
+ )
24
+ self.logger = logging.getLogger(__name__)
25
+
26
+ # Define base paths
27
+ self.project_root = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
28
+ self.models_dir = self.project_root / "models"
29
+ self.embeddings_dir = self.models_dir / "embeddings"
30
+ self.indices_dir = self.models_dir / "indices" / "annoy"
31
+
32
+ self.logger.info(f"Project root: {self.project_root}")
33
+ self.logger.info(f"Models directory: {self.models_dir}")
34
+ self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
35
+
36
+ try:
37
+ # Check directory existence
38
+ if not self.embeddings_dir.exists():
39
+ raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
40
+ if not self.indices_dir.exists():
41
+ raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
42
+
43
+ # Load embeddings
44
+ self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
45
+ self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
46
+
47
+ # Load chunks
48
+ with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
49
+ self.emergency_chunks = json.load(f)
50
+ with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
51
+ self.treatment_chunks = json.load(f)
52
+
53
+ # Initialize model
54
+ self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
55
+
56
+ self.logger.info("Test environment initialized successfully")
57
+ self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
58
+ self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
59
+
60
+ except FileNotFoundError as e:
61
+ self.logger.error(f"File not found: {e}")
62
+ raise
63
+ except Exception as e:
64
+ self.logger.error(f"Error during initialization: {e}")
65
+ raise
66
+
67
+ def _safe_search(
68
+ self,
69
+ index: AnnoyIndex,
70
+ query_vector: np.ndarray,
71
+ k: int = 5
72
+ ) -> Tuple[Optional[List[int]], Optional[List[float]]]:
73
+ """Safe search wrapper with error handling"""
74
+ try:
75
+ indices, distances = index.get_nns_by_vector(
76
+ query_vector, k, include_distances=True
77
+ )
78
+ self.logger.debug(f"Search successful: found {len(indices)} results")
79
+ return indices, distances
80
+
81
+ except Exception as e:
82
+ self.logger.error(f"Search failed: {str(e)}")
83
+ return None, None
84
+
85
+ def test_embedding_dimensions(self):
86
+ """Test embedding dimensions and data quality."""
87
+ self.logger.info("\n=== Embedding Validation Report ===")
88
+
89
+ try:
90
+ # Basic dimension checks
91
+ assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
92
+ assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
93
+
94
+ # Count verification
95
+ assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
96
+ "Emergency chunks count mismatch"
97
+ assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
98
+ "Treatment chunks count mismatch"
99
+
100
+ # Data quality checks
101
+ for name, emb in [("Emergency", self.emergency_emb),
102
+ ("Treatment", self.treatment_emb)]:
103
+ # Check for NaN and Inf
104
+ assert not np.isnan(emb).any(), f"{name} contains NaN values"
105
+ assert not np.isinf(emb).any(), f"{name} contains Inf values"
106
+
107
+ # Value distribution analysis
108
+ self.logger.info(f"\n{name} Embeddings Statistics:")
109
+ self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
110
+ self.logger.info(f"- Mean: {np.mean(emb):.3f}")
111
+ self.logger.info(f"- Std: {np.std(emb):.3f}")
112
+
113
+ self.logger.info("\n✅ All embedding validations passed")
114
+
115
+ except AssertionError as e:
116
+ self.logger.error(f"Validation failed: {str(e)}")
117
+ raise
118
+
119
+ def test_multiple_known_item_search(self):
120
+ """Test ANNOY search with multiple random samples."""
121
+ self.logger.info("\n=== Multiple Known-Item Search Test ===")
122
+
123
+ emergency_index = AnnoyIndex(768, 'angular')
124
+ emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
125
+
126
+ # Test 20 random samples
127
+ test_indices = np.random.choice(
128
+ self.emergency_emb.shape[0],
129
+ size=20,
130
+ replace=False
131
+ )
132
+
133
+ success_count = 0
134
+ for test_idx in test_indices:
135
+ try:
136
+ test_emb = self.emergency_emb[test_idx]
137
+ indices, distances = self._safe_search(emergency_index, test_emb)
138
+
139
+ if indices is None:
140
+ continue
141
+
142
+ # Verify self-retrieval
143
+ assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
144
+ assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
145
+ success_count += 1
146
+
147
+ except AssertionError as e:
148
+ self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
149
+
150
+ self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
151
+ assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
152
+
153
+ def test_balanced_cross_dataset_search(self):
154
+ """Test search across both emergency and treatment datasets."""
155
+ self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
156
+
157
+ # Initialize indices
158
+ emergency_index = AnnoyIndex(768, 'angular')
159
+ treatment_index = AnnoyIndex(768, 'angular')
160
+
161
+ try:
162
+ emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
163
+ treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
164
+
165
+ # Test queries
166
+ test_queries = [
167
+ "What is the treatment protocol for acute myocardial infarction?",
168
+ "How to manage severe chest pain with difficulty breathing?",
169
+ "What are the emergency procedures for anaphylactic shock?"
170
+ ]
171
+
172
+ for query in test_queries:
173
+ print(f"\n\n=== Query: {query} ===")
174
+
175
+ # Generate query vector
176
+ query_emb = self.model.encode([query])[0]
177
+
178
+ # Get top-5 results from each dataset
179
+ e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
180
+ t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
181
+
182
+ if None in [e_indices, e_distances, t_indices, t_distances]:
183
+ self.logger.error("Search failed for one or both datasets")
184
+ continue
185
+
186
+ # Print first sentence of each result
187
+ print("\nEmergency Dataset Results:")
188
+ for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
189
+ text = self.emergency_chunks[idx]['text']
190
+ first_sentence = text.split('.')[0] + '.'
191
+ print(f"\nE-{i} (distance: {dist:.3f}):")
192
+ print(first_sentence)
193
+
194
+ print("\nTreatment Dataset Results:")
195
+ for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
196
+ text = self.treatment_chunks[idx]['text']
197
+ first_sentence = text.split('.')[0] + '.'
198
+ print(f"\nT-{i} (distance: {dist:.3f}):")
199
+ print(first_sentence)
200
+
201
+ except Exception as e:
202
+ self.logger.error(f"Test failed: {str(e)}")
203
+ raise
204
+ else:
205
+ self.logger.info("\n✅ Cross-dataset search test completed")
206
+
207
+ if __name__ == "__main__":
208
+ # Manual test execution
209
+ test = TestEmbeddingValidation()
210
+ test.setup_class()
211
+ test.test_embedding_dimensions()
212
+ test.test_multiple_known_item_search()
213
+ test.test_balanced_cross_dataset_search()