Yan-Bo Chen committed on
Commit cd2cfdd · 2 Parent(s): 21e5bc6 1e4fdbf

Merge pull request #1 from YanBoChen0928/dataprocessing


Data preprocessing done -> proceed to processing, embedding...

Files changed (28)
  1. .gitignore +1 -1
  2. dataset/analysis/integrity_check/integrity_check_report.json +29 -0
  3. dataset/analysis/keyword_matching_test_results.json +151 -0
  4. dataset/analysis/stats/analysis_stats_emergency_subset.json +55 -0
  5. dataset/analysis/stats/analysis_stats_emergency_subset_opt.json +55 -0
  6. dataset/analysis/subset_comparison_first10_records_20250726_163149.md +198 -0
  7. dataset/analysis/subset_comparison_first10_records_20250726_163158.md +198 -0
  8. dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json +293 -0
  9. dataset/check_source.py +18 -0
  10. dataset/filter_guidelines.py +31 -0
  11. dataset/keywords/emergency_keywords.txt +47 -0
  12. dataset/keywords/special_terms_emergency.json +26 -0
  13. dataset/keywords/special_terms_treatment.json +25 -0
  14. dataset/keywords/treatment_keywords.txt +105 -0
  15. dataset/scripts/01_filter_emergency.py +58 -0
  16. dataset/scripts/01_filter_emergency_opt.py +112 -0
  17. dataset/scripts/02_filter_treatment.py +103 -0
  18. dataset/scripts/02_filter_treatment_opt.py +131 -0
  19. dataset/scripts/check_subset_integrity.py +178 -0
  20. dataset/scripts/commit_message_20250726_special_terms.txt +39 -0
  21. dataset/scripts/compare_subsets_opt.py +124 -0
  22. dataset/scripts/data_explorer.py +123 -0
  23. dataset/scripts/data_explorer_opt.py +118 -0
  24. dataset/scripts/data_explorer_treatment.py +265 -0
  25. dataset/scripts/data_explorer_treatment_opt.py +262 -0
  26. dataset/scripts/keyword_Match_Clean_for_subset_filter.txt +85 -0
  27. dataset/scripts/test_keyword_matching.py +175 -0
  28. requirements.txt +7 -0
.gitignore CHANGED
@@ -1,4 +1,4 @@
- dataset/
+ dataset/dataset/

  #virtual environment
  genAIvenv/
dataset/analysis/integrity_check/integrity_check_report.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "sample_analysis": {
+     "matched": {
+       "non_null": 100,
+       "non_empty": 100,
+       "unique_values": 84
+     },
+     "treatment_matched": {
+       "non_null": 100,
+       "non_empty": 100,
+       "unique_values": 100
+     }
+   },
+   "full_file_analysis": {
+     "total_records": 9367,
+     "matched_column": {
+       "non_null_count": 9367,
+       "non_empty_count": 9367,
+       "null_percentage": 0.0
+     },
+     "treatment_matched_column": {
+       "non_null_count": 9367,
+       "non_empty_count": 9367,
+       "null_percentage": 0.0
+     },
+     "both_matched_count": 3315,
+     "both_matched_percentage": 35.39019963702359
+   }
+ }
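
Note: `both_matched_percentage` above is simply `both_matched_count / total_records * 100`. A minimal sketch of how such a count can be derived with pandas; the input path is an assumption taken from the output of `02_filter_treatment.py` further down, and the committed checker is `dataset/scripts/check_subset_integrity.py`:

```
import pandas as pd

# Assumed input: the subset written by 02_filter_treatment.py
df = pd.read_csv("dataset/emergency_treatment/emergency_treatment_subset.csv")

# Rows whose emergency and treatment keyword columns are both non-empty
both = (df["matched"].fillna("").str.len() > 0) & (df["treatment_matched"].fillna("").str.len() > 0)

# Sanity arithmetic for the report values above: 3315 / 9367 * 100 ~= 35.39
print(round(100 * 3315 / 9367, 2))
```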
dataset/analysis/keyword_matching_test_results.json ADDED
@@ -0,0 +1,151 @@
1
+ {
2
+ "special_terms_matching": [
3
+ {
4
+ "clean_text": "Patient needs an x-ray of the chest",
5
+ "category": "x-ray variants",
6
+ "matched": "x-ray"
7
+ },
8
+ {
9
+ "clean_text": "Ordered chest xray",
10
+ "category": "x-ray variants",
11
+ "matched": "xray"
12
+ },
13
+ {
14
+ "clean_text": "X ray shows pneumonia",
15
+ "category": "x-ray variants",
16
+ "matched": "X ray"
17
+ },
18
+ {
19
+ "clean_text": "XRAY negative",
20
+ "category": "x-ray variants",
21
+ "matched": "XRAY"
22
+ },
23
+ {
24
+ "clean_text": "CT scan reveals nodule",
25
+ "category": "ct-scan variants",
26
+ "matched": "CT scan"
27
+ },
28
+ {
29
+ "clean_text": "CT-scan indicates mass",
30
+ "category": "ct-scan variants",
31
+ "matched": "CT-scan"
32
+ },
33
+ {
34
+ "clean_text": "Requires ctscan urgently",
35
+ "category": "ct-scan variants",
36
+ "matched": "ctscan"
37
+ },
38
+ {
39
+ "clean_text": "CTSCAN of abdomen",
40
+ "category": "ct-scan variants",
41
+ "matched": "CTSCAN"
42
+ },
43
+ {
44
+ "clean_text": "Point-of-care testing needed",
45
+ "category": "point-of-care variants",
46
+ "matched": "Point-of-care"
47
+ },
48
+ {
49
+ "clean_text": "Point of care ultrasound",
50
+ "category": "point-of-care variants",
51
+ "matched": "Point of care"
52
+ },
53
+ {
54
+ "clean_text": "POC testing results",
55
+ "category": "point-of-care variants",
56
+ "matched": ""
57
+ },
58
+ {
59
+ "clean_text": "Ordered both x-ray and CT scan",
60
+ "category": "mixed cases",
61
+ "matched": "x-ray|CT scan"
62
+ },
63
+ {
64
+ "clean_text": "XRAY and CTSCAN negative",
65
+ "category": "mixed cases",
66
+ "matched": "XRAY|CTSCAN"
67
+ },
68
+ {
69
+ "clean_text": "Multiple point-of-care tests with x-ray",
70
+ "category": "mixed cases",
71
+ "matched": "point-of-care|x-ray"
72
+ },
73
+ {
74
+ "clean_text": "No imaging mentioned",
75
+ "category": "negative cases",
76
+ "matched": ""
77
+ },
78
+ {
79
+ "clean_text": "Regular examination only",
80
+ "category": "negative cases",
81
+ "matched": ""
82
+ },
83
+ {
84
+ "clean_text": "Laboratory tests pending",
85
+ "category": "negative cases",
86
+ "matched": ""
87
+ }
88
+ ],
89
+ "basic_matching": [
90
+ {
91
+ "clean_text": "Emergency treatment required",
92
+ "category": "simple matches",
93
+ "matched": "Emergency"
94
+ },
95
+ {
96
+ "clean_text": "Acute condition observed",
97
+ "category": "simple matches",
98
+ "matched": "Acute"
99
+ },
100
+ {
101
+ "clean_text": "Urgent care needed",
102
+ "category": "simple matches",
103
+ "matched": "Urgent"
104
+ },
105
+ {
106
+ "clean_text": "EMERGENCY situation",
107
+ "category": "case variations",
108
+ "matched": "EMERGENCY"
109
+ },
110
+ {
111
+ "clean_text": "Acute RESPIRATORY failure",
112
+ "category": "case variations",
113
+ "matched": "Acute"
114
+ },
115
+ {
116
+ "clean_text": "URgent surgical intervention",
117
+ "category": "case variations",
118
+ "matched": "URgent"
119
+ },
120
+ {
121
+ "clean_text": "Emergency treatment for acute condition",
122
+ "category": "multiple matches",
123
+ "matched": "Emergency|acute"
124
+ },
125
+ {
126
+ "clean_text": "Urgent care in emergency department",
127
+ "category": "multiple matches",
128
+ "matched": "Urgent|emergency"
129
+ },
130
+ {
131
+ "clean_text": "Acute respiratory emergency",
132
+ "category": "multiple matches",
133
+ "matched": "Acute|emergency"
134
+ },
135
+ {
136
+ "clean_text": "Non-emergency situation",
137
+ "category": "partial words",
138
+ "matched": "emergency"
139
+ },
140
+ {
141
+ "clean_text": "Subacute condition",
142
+ "category": "partial words",
143
+ "matched": ""
144
+ },
145
+ {
146
+ "clean_text": "Emergency-related",
147
+ "category": "partial words",
148
+ "matched": "Emergency"
149
+ }
150
+ ]
151
+ }
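
Note: the "partial words" cases above follow directly from the `\b` word-boundary anchors used by the filter scripts later in this commit: a hyphen is a non-word character, so "Non-emergency" still matches "emergency", while "Subacute" does not match "acute"; "POC testing" stays unmatched because "POC" is not one of the listed point-of-care variants. A small, self-contained illustration of that boundary behaviour:

```
import re

pattern = re.compile(r"\b(?:emergency|acute)\b", re.IGNORECASE)

print(pattern.findall("Non-emergency situation"))  # ['emergency'], the hyphen acts as a boundary
print(pattern.findall("Subacute condition"))       # [], no boundary between "Sub" and "acute"
print(pattern.findall("Emergency treatment for acute condition"))  # ['Emergency', 'acute']
```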
dataset/analysis/stats/analysis_stats_emergency_subset.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "basic_statistics": {
3
+ "total_records": 10282,
4
+ "avg_length": 25185.078194903715
5
+ },
6
+ "keyword_statistics": {
7
+ "Acute abdomen": 52,
8
+ "Acute bleeding": 31,
9
+ "Acute Coronary Syndrome": 345,
10
+ "Acute Kidney Injury": 202,
11
+ "Acute pancreatitis": 214,
12
+ "Acute respiratory distress syndrome": 231,
13
+ "Acute stroke": 67,
14
+ "Anaphylaxis": 1016,
15
+ "Anaphylactic Shock": 153,
16
+ "Arrhythmia": 1547,
17
+ "Atrial fibrillation": 771,
18
+ "Atrial flutter": 146,
19
+ "Bradycardia": 884,
20
+ "Cardiac arrest": 614,
21
+ "Cardiogenic Shock": 196,
22
+ "Chest pain": 1433,
23
+ "Dyspnea": 1319,
24
+ "Fever": 4270,
25
+ "Gastrointestinal Hemorrhage": 158,
26
+ "GI bleeding": 105,
27
+ "Hemorrhage": 1611,
28
+ "Hemorrhagic stroke": 117,
29
+ "Hyperthermia": 305,
30
+ "Hypovolemic Shock": 63,
31
+ "Hypotension": 1929,
32
+ "Hypothermia": 356,
33
+ "Internal bleeding": 70,
34
+ "Intracranial Hemorrhages": 6,
35
+ "Ischemic stroke": 224,
36
+ "Loss of consciousness": 422,
37
+ "Myocardial Infarction": 1708,
38
+ "MI": 10183,
39
+ "Pulmonary Edema": 487,
40
+ "Pulmonary Embolism": 654,
41
+ "Respiratory distress": 730,
42
+ "Respiratory failure": 579,
43
+ "Sepsis": 1181,
44
+ "Severe Sepsis": 81,
45
+ "Septic Shock": 244,
46
+ "Shock": 1881,
47
+ "Status Epilepticus": 150,
48
+ "Syncope": 834,
49
+ "Tachycardia": 1650,
50
+ "Tachypnea": 268,
51
+ "Traumatic Brain Injury": 171,
52
+ "Ventricular Tachycardia": 491,
53
+ "Ventricular fibrillation": 295
54
+ }
55
+ }
dataset/analysis/stats/analysis_stats_emergency_subset_opt.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "basic_statistics": {
3
+ "total_records": 11914,
4
+ "avg_length": 23847.07579318449
5
+ },
6
+ "keyword_statistics": {
7
+ "Acute abdomen": 52,
8
+ "Acute bleeding": 31,
9
+ "Acute Coronary Syndrome": 351,
10
+ "Acute Kidney Injury": 202,
11
+ "Acute pancreatitis": 214,
12
+ "Acute respiratory distress syndrome": 231,
13
+ "Acute stroke": 67,
14
+ "Anaphylaxis": 1016,
15
+ "Anaphylactic Shock": 153,
16
+ "Arrhythmia": 1564,
17
+ "Atrial fibrillation": 771,
18
+ "Atrial flutter": 146,
19
+ "Bradycardia": 884,
20
+ "Cardiac arrest": 614,
21
+ "Cardiogenic Shock": 196,
22
+ "Chest pain": 1434,
23
+ "Dyspnea": 1319,
24
+ "Fever": 4279,
25
+ "Gastrointestinal Hemorrhage": 158,
26
+ "GI bleeding": 105,
27
+ "Hemorrhage": 1621,
28
+ "Hemorrhagic stroke": 117,
29
+ "Hyperthermia": 305,
30
+ "Hypovolemic Shock": 63,
31
+ "Hypotension": 1929,
32
+ "Hypothermia": 356,
33
+ "Internal bleeding": 70,
34
+ "Intracranial Hemorrhages": 6,
35
+ "Ischemic stroke": 225,
36
+ "Loss of consciousness": 422,
37
+ "Myocardial Infarction": 1710,
38
+ "MI": 11773,
39
+ "Pulmonary Edema": 487,
40
+ "Pulmonary Embolism": 654,
41
+ "Respiratory distress": 730,
42
+ "Respiratory failure": 579,
43
+ "Sepsis": 1188,
44
+ "Severe Sepsis": 81,
45
+ "Septic Shock": 244,
46
+ "Shock": 1892,
47
+ "Status Epilepticus": 150,
48
+ "Syncope": 834,
49
+ "Tachycardia": 1651,
50
+ "Tachypnea": 268,
51
+ "Traumatic Brain Injury": 171,
52
+ "Ventricular Tachycardia": 492,
53
+ "Ventricular fibrillation": 295
54
+ }
55
+ }
dataset/analysis/subset_comparison_first10_records_20250726_163149.md ADDED
@@ -0,0 +1,198 @@
1
+ # Optimized Subsets Comparison Report
2
+
3
+ Generated on: 2025-07-26 16:31:49
4
+
5
+ File format: CSV
6
+
7
+
8
+ ## Basic Statistics
9
+
10
+ - Emergency subset total records: 11914
11
+ - Emergency+Treatment subset total records: 11023
12
+ - Avg Emergency Text Length: 23847.08
13
+ - Avg Treatment Text Length: 25408.64
14
+ - Avg Emergency Keywords: 2.85
15
+ - Avg Treatment Keywords: 2.97
16
+
17
+ ## Emergency Subset (First 10 Records)
18
+
19
+
20
+ ### Record 1
21
+ ```
22
+ Text preview: # Section 1: Recommendations
23
+
24
+ # RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
25
+ Matched keywords: shock
26
+ Text length: 37792
27
+ ```
28
+
29
+
30
+ ### Record 2
31
+ ```
32
+ Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
33
+ Matched keywords: hemorrhage
34
+ Text length: 7559
35
+ ```
36
+
37
+
38
+ ### Record 3
39
+ ```
40
+ Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
41
+ Matched keywords: ards|pulmonary embolism
42
+ Text length: 11731
43
+ ```
44
+
45
+
46
+ ### Record 4
47
+ ```
48
+ Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
49
+ Matched keywords: fever|dyspnea|hypotension|sepsis
50
+ Text length: 46087
51
+ ```
52
+
53
+
54
+ ### Record 5
55
+ ```
56
+ Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
57
+ Matched keywords: hyperthermia
58
+ Text length: 35302
59
+ ```
60
+
61
+
62
+ ### Record 6
63
+ ```
64
+ Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
65
+ Matched keywords: hemorrhage|dyspnea
66
+ Text length: 16186
67
+ ```
68
+
69
+
70
+ ### Record 7
71
+ ```
72
+ Text preview: # GUIDELINE OBJECTIVES
73
+ The objective of this guideline is to update a previous guideline on chemothe...
74
+ Matched keywords: hemorrhage
75
+ Text length: 7551
76
+ ```
77
+
78
+
79
+ ### Record 8
80
+ ```
81
+ Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
82
+ Matched keywords: mi
83
+ Text length: 50729
84
+ ```
85
+
86
+
87
+ ### Record 9
88
+ ```
89
+ Text preview: # GUIDELINE OBJECTIVE
90
+ This guideline was written to provide guidance on the most appropriate follow-...
91
+ Matched keywords: hemorrhage
92
+ Text length: 4299
93
+ ```
94
+
95
+
96
+ ### Record 10
97
+ ```
98
+ Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
99
+ Matched keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
100
+ Text length: 54427
101
+ ```
102
+
103
+
104
+ ## Emergency+Treatment Subset (First 10 Records)
105
+
106
+
107
+ ### Record 1
108
+ ```
109
+ Text preview: # Section 1: Recommendations
110
+
111
+ # RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
112
+ Emergency keywords: shock
113
+ Treatment keywords: management|medication|procedure|fluid|monitoring|iv|administer|dose
114
+ Text length: 37792
115
+ ```
116
+
117
+
118
+ ### Record 2
119
+ ```
120
+ Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
121
+ Emergency keywords: hemorrhage
122
+ Treatment keywords: Therapy|treatment|x-ray|us|ct
123
+ Text length: 7559
124
+ ```
125
+
126
+
127
+ ### Record 3
128
+ ```
129
+ Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
130
+ Emergency keywords: ards|pulmonary embolism
131
+ Treatment keywords: dopamine|therapy|treatment|surgery|iv|intervention|dose
132
+ Text length: 11731
133
+ ```
134
+
135
+
136
+ ### Record 4
137
+ ```
138
+ Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
139
+ Emergency keywords: fever|dyspnea|hypotension|sepsis
140
+ Treatment keywords: treatment|iv|therapy|treat|management|intervention|supportive care|dose
141
+ Text length: 46087
142
+ ```
143
+
144
+
145
+ ### Record 5
146
+ ```
147
+ Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
148
+ Emergency keywords: hyperthermia
149
+ Treatment keywords: surgery|treatment|therapy|treat|dose|ct
150
+ Text length: 35302
151
+ ```
152
+
153
+
154
+ ### Record 6
155
+ ```
156
+ Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
157
+ Emergency keywords: hemorrhage|dyspnea
158
+ Treatment keywords: therapy|management|treatment|morphine|dose
159
+ Text length: 16186
160
+ ```
161
+
162
+
163
+ ### Record 7
164
+ ```
165
+ Text preview: # GUIDELINE OBJECTIVES
166
+ The objective of this guideline is to update a previous guideline on chemothe...
167
+ Emergency keywords: hemorrhage
168
+ Treatment keywords: therapy|treatment|surgery
169
+ Text length: 7551
170
+ ```
171
+
172
+
173
+ ### Record 8
174
+ ```
175
+ Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
176
+ Emergency keywords: mi
177
+ Treatment keywords: iv|Dose|therapy|administer|surgery|treatment|treat|medication|ecg
178
+ Text length: 50729
179
+ ```
180
+
181
+
182
+ ### Record 9
183
+ ```
184
+ Text preview: # GUIDELINE OBJECTIVE
185
+ This guideline was written to provide guidance on the most appropriate follow-...
186
+ Emergency keywords: hemorrhage
187
+ Treatment keywords: treatment|ct
188
+ Text length: 4299
189
+ ```
190
+
191
+
192
+ ### Record 10
193
+ ```
194
+ Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
195
+ Emergency keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
196
+ Treatment keywords: treatment|oxygen|iv|dose|therapy|surgery|x-ray|administer|procedure|management
197
+ Text length: 54427
198
+ ```
dataset/analysis/subset_comparison_first10_records_20250726_163158.md ADDED
@@ -0,0 +1,198 @@
1
+ # Optimized Subsets Comparison Report
2
+
3
+ Generated on: 2025-07-26 16:31:58
4
+
5
+ File format: JSONL
6
+
7
+
8
+ ## Basic Statistics
9
+
10
+ - Emergency subset total records: 11914
11
+ - Emergency+Treatment subset total records: 11023
12
+ - Avg Emergency Text Length: 23847.08
13
+ - Avg Treatment Text Length: 25408.64
14
+ - Avg Emergency Keywords: 2.85
15
+ - Avg Treatment Keywords: 2.97
16
+
17
+ ## Emergency Subset (First 10 Records)
18
+
19
+
20
+ ### Record 1
21
+ ```
22
+ Text preview: # Section 1: Recommendations
23
+
24
+ # RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
25
+ Matched keywords: shock
26
+ Text length: 37792
27
+ ```
28
+
29
+
30
+ ### Record 2
31
+ ```
32
+ Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
33
+ Matched keywords: hemorrhage
34
+ Text length: 7559
35
+ ```
36
+
37
+
38
+ ### Record 3
39
+ ```
40
+ Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
41
+ Matched keywords: ards|pulmonary embolism
42
+ Text length: 11731
43
+ ```
44
+
45
+
46
+ ### Record 4
47
+ ```
48
+ Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
49
+ Matched keywords: fever|dyspnea|hypotension|sepsis
50
+ Text length: 46087
51
+ ```
52
+
53
+
54
+ ### Record 5
55
+ ```
56
+ Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
57
+ Matched keywords: hyperthermia
58
+ Text length: 35302
59
+ ```
60
+
61
+
62
+ ### Record 6
63
+ ```
64
+ Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
65
+ Matched keywords: hemorrhage|dyspnea
66
+ Text length: 16186
67
+ ```
68
+
69
+
70
+ ### Record 7
71
+ ```
72
+ Text preview: # GUIDELINE OBJECTIVES
73
+ The objective of this guideline is to update a previous guideline on chemothe...
74
+ Matched keywords: hemorrhage
75
+ Text length: 7551
76
+ ```
77
+
78
+
79
+ ### Record 8
80
+ ```
81
+ Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
82
+ Matched keywords: mi
83
+ Text length: 50729
84
+ ```
85
+
86
+
87
+ ### Record 9
88
+ ```
89
+ Text preview: # GUIDELINE OBJECTIVE
90
+ This guideline was written to provide guidance on the most appropriate follow-...
91
+ Matched keywords: hemorrhage
92
+ Text length: 4299
93
+ ```
94
+
95
+
96
+ ### Record 10
97
+ ```
98
+ Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
99
+ Matched keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
100
+ Text length: 54427
101
+ ```
102
+
103
+
104
+ ## Emergency+Treatment Subset (First 10 Records)
105
+
106
+
107
+ ### Record 1
108
+ ```
109
+ Text preview: # Section 1: Recommendations
110
+
111
+ # RECOMMENDATIONS Recommendation 1: General Measures Committee Respons...
112
+ Emergency keywords: shock
113
+ Treatment keywords: management|medication|procedure|fluid|monitoring|iv|administer|dose
114
+ Text length: 37792
115
+ ```
116
+
117
+
118
+ ### Record 2
119
+ ```
120
+ Text preview: Evidence-based Series 4-9 Version 2 A Quality Initiative of the Program in Evidence-based Care (PEBC...
121
+ Emergency keywords: hemorrhage
122
+ Treatment keywords: Therapy|treatment|x-ray|us|ct
123
+ Text length: 7559
124
+ ```
125
+
126
+
127
+ ### Record 3
128
+ ```
129
+ Text preview: Neuroendocrine tumours (NETs) constitute a heterogeneous group of neoplasms: they include epithelial...
130
+ Emergency keywords: ards|pulmonary embolism
131
+ Treatment keywords: dopamine|therapy|treatment|surgery|iv|intervention|dose
132
+ Text length: 11731
133
+ ```
134
+
135
+
136
+ ### Record 4
137
+ ```
138
+ Text preview: Given the potential toxicities associated with alemtuzumab, and given the limited nature of the clin...
139
+ Emergency keywords: fever|dyspnea|hypotension|sepsis
140
+ Treatment keywords: treatment|iv|therapy|treat|management|intervention|supportive care|dose
141
+ Text length: 46087
142
+ ```
143
+
144
+
145
+ ### Record 5
146
+ ```
147
+ Text preview: Although the incidence and mortality of gastric cancer has been steadily decreasing in Canadian men ...
148
+ Emergency keywords: hyperthermia
149
+ Treatment keywords: surgery|treatment|therapy|treat|dose|ct
150
+ Text length: 35302
151
+ ```
152
+
153
+
154
+ ### Record 6
155
+ ```
156
+ Text preview: There are various definitions for palliative care, but most people would agree that "it focuses on c...
157
+ Emergency keywords: hemorrhage|dyspnea
158
+ Treatment keywords: therapy|management|treatment|morphine|dose
159
+ Text length: 16186
160
+ ```
161
+
162
+
163
+ ### Record 7
164
+ ```
165
+ Text preview: # GUIDELINE OBJECTIVES
166
+ The objective of this guideline is to update a previous guideline on chemothe...
167
+ Emergency keywords: hemorrhage
168
+ Treatment keywords: therapy|treatment|surgery
169
+ Text length: 7551
170
+ ```
171
+
172
+
173
+ ### Record 8
174
+ ```
175
+ Text preview: Anthracyclines have been established to be superior to some non-anthracycline chemotherapy regimens ...
176
+ Emergency keywords: mi
177
+ Treatment keywords: iv|Dose|therapy|administer|surgery|treatment|treat|medication|ecg
178
+ Text length: 50729
179
+ ```
180
+
181
+
182
+ ### Record 9
183
+ ```
184
+ Text preview: # GUIDELINE OBJECTIVE
185
+ This guideline was written to provide guidance on the most appropriate follow-...
186
+ Emergency keywords: hemorrhage
187
+ Treatment keywords: treatment|ct
188
+ Text length: 4299
189
+ ```
190
+
191
+
192
+ ### Record 10
193
+ ```
194
+ Text preview: PDT is a local treatment. It utilizes the local, selective, cytotoxic reaction produced by photosens...
195
+ Emergency keywords: dyspnea|mi|hemorrhage|respiratory_failure|cva|hypotension|sepsis|ards
196
+ Treatment keywords: treatment|oxygen|iv|dose|therapy|surgery|x-ray|administer|procedure|management
197
+ Text length: 54427
198
+ ```
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json ADDED
@@ -0,0 +1,293 @@
1
+ {
2
+ "basic_statistics": {
3
+ "total_records": 9367,
4
+ "avg_text_length": 27179.22952919825,
5
+ "emergency_keywords_count": 47,
6
+ "treatment_keywords_count": 105
7
+ },
8
+ "emergency_keyword_stats": {
9
+ "Acute abdomen": 51,
10
+ "Acute bleeding": 31,
11
+ "Acute Coronary Syndrome": 332,
12
+ "Acute Kidney Injury": 200,
13
+ "Acute pancreatitis": 202,
14
+ "Acute respiratory distress syndrome": 225,
15
+ "Acute stroke": 65,
16
+ "Anaphylaxis": 1002,
17
+ "Anaphylactic Shock": 148,
18
+ "Arrhythmia": 1490,
19
+ "Atrial fibrillation": 736,
20
+ "Atrial flutter": 139,
21
+ "Bradycardia": 845,
22
+ "Cardiac arrest": 600,
23
+ "Cardiogenic Shock": 192,
24
+ "Chest pain": 1408,
25
+ "Dyspnea": 1296,
26
+ "Fever": 4008,
27
+ "Gastrointestinal Hemorrhage": 158,
28
+ "GI bleeding": 103,
29
+ "Hemorrhage": 1532,
30
+ "Hemorrhagic stroke": 109,
31
+ "Hyperthermia": 283,
32
+ "Hypovolemic Shock": 61,
33
+ "Hypotension": 1897,
34
+ "Hypothermia": 340,
35
+ "Internal bleeding": 67,
36
+ "Intracranial Hemorrhages": 5,
37
+ "Ischemic stroke": 216,
38
+ "Loss of consciousness": 406,
39
+ "Myocardial Infarction": 1607,
40
+ "MI": 9316,
41
+ "Pulmonary Edema": 471,
42
+ "Pulmonary Embolism": 624,
43
+ "Respiratory distress": 713,
44
+ "Respiratory failure": 554,
45
+ "Sepsis": 1145,
46
+ "Severe Sepsis": 81,
47
+ "Septic Shock": 231,
48
+ "Shock": 1702,
49
+ "Status Epilepticus": 149,
50
+ "Syncope": 806,
51
+ "Tachycardia": 1576,
52
+ "Tachypnea": 262,
53
+ "Traumatic Brain Injury": 151,
54
+ "Ventricular Tachycardia": 461,
55
+ "Ventricular fibrillation": 280
56
+ },
57
+ "treatment_keyword_stats": {
58
+ "ACLS": 30,
59
+ "administer": 3881,
60
+ "Adrenaline": 135,
61
+ "Advanced Cardiac Life Support": 34,
62
+ "Airway Management": 174,
63
+ "alpha blocker": 35,
64
+ "Amiodarone": 315,
65
+ "analgesia": 323,
66
+ "Anesthesia Procedural": 0,
67
+ "Anti-Bacterial Agents": 1,
68
+ "antibiotic": 1922,
69
+ "arterial line placement": 0,
70
+ "beta blocker": 297,
71
+ "Bi-level Positive Airway Pressure": 6,
72
+ "bipap": 25,
73
+ "Blood Transfusion": 379,
74
+ "Bosmin": 0,
75
+ "Cardiopulmonary Resuscitation": 131,
76
+ "Cardioversion": 142,
77
+ "Catheterization Arterial": 0,
78
+ "Catheterization Central Venous": 0,
79
+ "central line placement": 6,
80
+ "compression dressing": 2,
81
+ "Computed Tomography": 518,
82
+ "cpap": 84,
83
+ "cpr": 151,
84
+ "crystalloids": 45,
85
+ "ct scan": 1036,
86
+ "Defibrillation": 96,
87
+ "Dopamine": 389,
88
+ "Dosage Forms": 210,
89
+ "dose": 5344,
90
+ "Drug Administration Routes": 0,
91
+ "Drug Therapy": 773,
92
+ "Epinephrine": 806,
93
+ "fluid": 2938,
94
+ "fluid resuscitation": 115,
95
+ "hemodynamic monitoring": 43,
96
+ "Hemodynamics": 135,
97
+ "Hemostasis": 180,
98
+ "Ibuprofen": 269,
99
+ "icu transfer": 9,
100
+ "Insulin": 808,
101
+ "intervention": 2695,
102
+ "intubation": 493,
103
+ "Intratracheal Intubation": 3,
104
+ "Intravenous Infusion": 576,
105
+ "iv fluids": 75,
106
+ "laboratory techniques": 29,
107
+ "laboratory testing": 296,
108
+ "levophed": 11,
109
+ "Lidocaine": 212,
110
+ "manage": 4416,
111
+ "management": 4008,
112
+ "medication": 4698,
113
+ "midazolam": 204,
114
+ "monitor": 4521,
115
+ "monitoring": 3593,
116
+ "Morphine": 289,
117
+ "Nebulization": 41,
118
+ "nitroglycerin": 125,
119
+ "NTG": 81,
120
+ "Norepinephrine": 392,
121
+ "normal saline": 252,
122
+ "Ondansetron": 43,
123
+ "Oxygen": 1779,
124
+ "Oxygen Inhalation Therapy": 2,
125
+ "oxygen therapy": 178,
126
+ "Patient Management": 281,
127
+ "Patient Monitoring": 107,
128
+ "POCUS": 10,
129
+ "point of care ultrasound": 2,
130
+ "procedural sedation": 26,
131
+ "procedure": 3073,
132
+ "radiologic imaging": 5,
133
+ "Radiography": 218,
134
+ "resuscitation": 539,
135
+ "Sedation": 602,
136
+ "splinting": 26,
137
+ "Splints": 29,
138
+ "supportive care": 564,
139
+ "surgical procedures": 482,
140
+ "Surgical Procedures Operative": 0,
141
+ "surgery": 3531,
142
+ "Suture": 179,
143
+ "Suturing": 53,
144
+ "Therapeutic Intervention": 181,
145
+ "Therapeutics": 182,
146
+ "Therapy": 6117,
147
+ "tourniquet": 56,
148
+ "transfusion": 826,
149
+ "treat": 8270,
150
+ "treatment": 7719,
151
+ "Ultrasonography Point of Care": 0,
152
+ "ultrasound": 1273,
153
+ "Vasoconstrictor Agents": 2,
154
+ "vasopressors": 188,
155
+ "ventilation support": 14,
156
+ "Ventilators": 86,
157
+ "Vital Signs": 459,
158
+ "vital signs monitoring": 1,
159
+ "wound care": 73,
160
+ "Wound Dressing": 30,
161
+ "Wound Management": 37,
162
+ "X-Ray": 1293
163
+ },
164
+ "cooccurrence_analysis": [
165
+ {
166
+ "emergency_keyword": "Fever",
167
+ "treatment_keyword": "treatment",
168
+ "cooccurrence_count": 3488,
169
+ "percentage": 37.23710899967973
170
+ },
171
+ {
172
+ "emergency_keyword": "Fever",
173
+ "treatment_keyword": "Therapy",
174
+ "cooccurrence_count": 2698,
175
+ "percentage": 28.803245436105477
176
+ },
177
+ {
178
+ "emergency_keyword": "Fever",
179
+ "treatment_keyword": "dose",
180
+ "cooccurrence_count": 2430,
181
+ "percentage": 25.94213729048788
182
+ },
183
+ {
184
+ "emergency_keyword": "Fever",
185
+ "treatment_keyword": "medication",
186
+ "cooccurrence_count": 1979,
187
+ "percentage": 21.127362015586634
188
+ },
189
+ {
190
+ "emergency_keyword": "Hypotension",
191
+ "treatment_keyword": "treatment",
192
+ "cooccurrence_count": 1760,
193
+ "percentage": 18.789366926443897
194
+ },
195
+ {
196
+ "emergency_keyword": "Fever",
197
+ "treatment_keyword": "management",
198
+ "cooccurrence_count": 1753,
199
+ "percentage": 18.714636489804633
200
+ },
201
+ {
202
+ "emergency_keyword": "Fever",
203
+ "treatment_keyword": "treat",
204
+ "cooccurrence_count": 1744,
205
+ "percentage": 18.618554499839863
206
+ },
207
+ {
208
+ "emergency_keyword": "Fever",
209
+ "treatment_keyword": "monitoring",
210
+ "cooccurrence_count": 1674,
211
+ "percentage": 17.87125013344721
212
+ },
213
+ {
214
+ "emergency_keyword": "Hypotension",
215
+ "treatment_keyword": "Therapy",
216
+ "cooccurrence_count": 1558,
217
+ "percentage": 16.63286004056795
218
+ },
219
+ {
220
+ "emergency_keyword": "Fever",
221
+ "treatment_keyword": "surgery",
222
+ "cooccurrence_count": 1505,
223
+ "percentage": 16.06704387744208
224
+ },
225
+ {
226
+ "emergency_keyword": "Tachycardia",
227
+ "treatment_keyword": "treatment",
228
+ "cooccurrence_count": 1441,
229
+ "percentage": 15.383794171025942
230
+ },
231
+ {
232
+ "emergency_keyword": "Hypotension",
233
+ "treatment_keyword": "dose",
234
+ "cooccurrence_count": 1423,
235
+ "percentage": 15.191630191096403
236
+ },
237
+ {
238
+ "emergency_keyword": "Myocardial Infarction",
239
+ "treatment_keyword": "treatment",
240
+ "cooccurrence_count": 1369,
241
+ "percentage": 14.615138251307783
242
+ },
243
+ {
244
+ "emergency_keyword": "Shock",
245
+ "treatment_keyword": "treatment",
246
+ "cooccurrence_count": 1340,
247
+ "percentage": 14.305540728087967
248
+ },
249
+ {
250
+ "emergency_keyword": "Fever",
251
+ "treatment_keyword": "fluid",
252
+ "cooccurrence_count": 1330,
253
+ "percentage": 14.198782961460447
254
+ },
255
+ {
256
+ "emergency_keyword": "Hemorrhage",
257
+ "treatment_keyword": "treatment",
258
+ "cooccurrence_count": 1328,
259
+ "percentage": 14.177431408134941
260
+ },
261
+ {
262
+ "emergency_keyword": "Hypotension",
263
+ "treatment_keyword": "monitoring",
264
+ "cooccurrence_count": 1325,
265
+ "percentage": 14.145404078146683
266
+ },
267
+ {
268
+ "emergency_keyword": "Tachycardia",
269
+ "treatment_keyword": "Therapy",
270
+ "cooccurrence_count": 1277,
271
+ "percentage": 13.632966798334579
272
+ },
273
+ {
274
+ "emergency_keyword": "Dyspnea",
275
+ "treatment_keyword": "treatment",
276
+ "cooccurrence_count": 1228,
277
+ "percentage": 13.10985374185972
278
+ },
279
+ {
280
+ "emergency_keyword": "Myocardial Infarction",
281
+ "treatment_keyword": "Therapy",
282
+ "cooccurrence_count": 1215,
283
+ "percentage": 12.97106864524394
284
+ }
285
+ ],
286
+ "path_b_validation": {
287
+ "avg_emergency_density": 0.3098621434407273,
288
+ "avg_treatment_density": 0.6108515041451529,
289
+ "high_density_records": 1298,
290
+ "precision_estimate": 0.9995729689334899
291
+ },
292
+ "condition_mapping_candidates": {}
293
+ }
dataset/check_source.py ADDED
@@ -0,0 +1,18 @@
+ import pandas as pd
+
+ # Read the JSONL file that was just downloaded and filtered
+ df = pd.read_json("dataset/guidelines_source_filtered.jsonl", lines=True)
+
+ # Show how often each source appears
+ print("📊 Record counts per source:")
+ print(df["source"].value_counts())
+
+ # Verify that only the nine expected sources are present
+ expected_sources = {"cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"}
+ actual_sources = set(df["source"].unique())
+
+ # Report the validation result
+ if actual_sources == expected_sources:
+     print("✅ Sources match the expected list exactly; no unexpected sources found.")
+ else:
+     print(f"❌ Unexpected sources found: {actual_sources - expected_sources}")
dataset/filter_guidelines.py ADDED
@@ -0,0 +1,31 @@
+ # filter_guidelines.py
+
+ from datasets import load_dataset
+ import pandas as pd
+ import os
+
+ # ✅ Trusted source abbreviations (the "source" field in the Hugging Face dataset)
+ approved_sources = ["cco", "cdc", "cma", "icrc", "nice", "pubmed", "spor", "who", "wikidoc"]
+
+ # Step 1: Load the dataset from Hugging Face
+ print("⏳ Loading data...")
+ ds = load_dataset("epfl-llm/guidelines", split="train")
+
+ # Step 2: Filter records by the source field
+ print("🔍 Filtering trusted sources...")
+ ds_filtered = ds.filter(lambda ex: ex["source"] in approved_sources)
+ print(f"✅ Filtering complete: {len(ds_filtered)} records in total.")
+
+ # Step 3: Convert to a pandas DataFrame
+ print("📄 Converting to DataFrame...")
+ df = ds_filtered.to_pandas()
+
+ # Step 4: Create the dataset folder (if it does not exist)
+ os.makedirs("dataset", exist_ok=True)
+
+ # Step 5: Save as JSONL and CSV in the dataset/ folder
+ print("💾 Saving to the dataset/ folder...")
+ df.to_json("dataset/guidelines_source_filtered.jsonl", orient="records", lines=True)
+ df.to_csv("dataset/guidelines_source_filtered.csv", index=False)
+
+ print("🎉 Done! Data from trusted sources has been saved.")
dataset/keywords/emergency_keywords.txt ADDED
@@ -0,0 +1,47 @@
1
+ Acute abdomen
2
+ Acute bleeding
3
+ Acute Coronary Syndrome
4
+ Acute Kidney Injury
5
+ Acute pancreatitis
6
+ Acute respiratory distress syndrome
7
+ Acute stroke
8
+ Anaphylaxis
9
+ Anaphylactic Shock
10
+ Arrhythmia
11
+ Atrial fibrillation
12
+ Atrial flutter
13
+ Bradycardia
14
+ Cardiac arrest
15
+ Cardiogenic Shock
16
+ Chest pain
17
+ Dyspnea
18
+ Fever
19
+ Gastrointestinal Hemorrhage
20
+ GI bleeding
21
+ Hemorrhage
22
+ Hemorrhagic stroke
23
+ Hyperthermia
24
+ Hypovolemic Shock
25
+ Hypotension
26
+ Hypothermia
27
+ Internal bleeding
28
+ Intracranial Hemorrhages
29
+ Ischemic stroke
30
+ Loss of consciousness
31
+ Myocardial Infarction
32
+ MI
33
+ Pulmonary Edema
34
+ Pulmonary Embolism
35
+ Respiratory distress
36
+ Respiratory failure
37
+ Sepsis
38
+ Severe Sepsis
39
+ Septic Shock
40
+ Shock
41
+ Status Epilepticus
42
+ Syncope
43
+ Tachycardia
44
+ Tachypnea
45
+ Traumatic Brain Injury
46
+ Ventricular Tachycardia
47
+ Ventricular fibrillation
dataset/keywords/special_terms_emergency.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "cardiac": {
+     "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
+     "acs": ["acs", "ACS", "acute coronary syndrome"]
+   },
+   "respiratory": {
+     "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
+     "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
+   },
+   "neurological": {
+     "loc": ["loc", "LOC", "loss of consciousness"],
+     "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
+   },
+   "shock": {
+     "shock": ["shock", "circulatory failure"],
+     "septic_shock": ["septic shock", "sepsis induced shock"]
+   },
+   "bleeding": {
+     "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
+     "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
+   },
+   "vital_signs": {
+     "hypotension": ["hypotension", "low bp", "low blood pressure"],
+     "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
+   }
+ }
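
Note: the category level in this file is only for readability; the `_opt` scripts below flatten it into a single standard-term -> variants mapping and map any matched variant back to its standard key. A minimal sketch of that lookup, inlining two entries from the file above:

```
terms_by_category = {
    "cardiac": {"mi": ["mi", "m.i.", "myocardial infarction", "MI"]},
    "respiratory": {"ards": ["ards", "ARDS", "acute respiratory distress syndrome"]},
}

# Flatten the nested categories into {"mi": [...], "ards": [...]}
special_terms = {}
for category in terms_by_category.values():
    special_terms.update(category)

def standardize(term: str) -> str:
    # Return the standard key if the term is a known variant, otherwise leave it unchanged
    for standard, variants in special_terms.items():
        if term.lower() in (v.lower() for v in variants):
            return standard
    return term

print(standardize("Myocardial Infarction"))  # mi
print(standardize("ARDS"))                   # ards
print(standardize("Sepsis"))                 # Sepsis (not a listed variant)
```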
dataset/keywords/special_terms_treatment.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "imaging": {
+     "x-ray": ["x-ray", "x ray", "xray", "XR"],
+     "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
+     "us": ["us", "u/s", "ultrasound", "sonography"]
+   },
+   "medications": {
+     "iv": ["iv", "i.v.", "intravenous"],
+     "im": ["im", "i.m.", "intramuscular"],
+     "po": ["po", "p.o.", "per os", "by mouth"]
+   },
+   "procedures": {
+     "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
+     "intubation": ["intubation", "ETT", "endotracheal tube"],
+     "cardioversion": ["cardioversion", "electrical cardioversion"]
+   },
+   "monitoring": {
+     "ecg": ["ecg", "ekg", "electrocardiogram"],
+     "monitoring": ["monitoring", "continuous observation"]
+   },
+   "ventilation": {
+     "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
+     "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
+   }
+ }
dataset/keywords/treatment_keywords.txt ADDED
@@ -0,0 +1,105 @@
1
+ ACLS
2
+ administer
3
+ Adrenaline
4
+ Advanced Cardiac Life Support
5
+ Airway Management
6
+ alpha blocker
7
+ Amiodarone
8
+ analgesia
9
+ Anesthesia Procedural
10
+ Anti-Bacterial Agents
11
+ antibiotic
12
+ arterial line placement
13
+ beta blocker
14
+ Bi-level Positive Airway Pressure
15
+ bipap
16
+ Blood Transfusion
17
+ Bosmin
18
+ Cardiopulmonary Resuscitation
19
+ Cardioversion
20
+ Catheterization Arterial
21
+ Catheterization Central Venous
22
+ central line placement
23
+ compression dressing
24
+ Computed Tomography
25
+ cpap
26
+ cpr
27
+ crystalloids
28
+ ct scan
29
+ Defibrillation
30
+ Dopamine
31
+ Dosage Forms
32
+ dose
33
+ Drug Administration Routes
34
+ Drug Therapy
35
+ Epinephrine
36
+ fluid
37
+ fluid resuscitation
38
+ hemodynamic monitoring
39
+ Hemodynamics
40
+ Hemostasis
41
+ Ibuprofen
42
+ icu transfer
43
+ Insulin
44
+ intervention
45
+ intubation
46
+ Intratracheal Intubation
47
+ Intravenous Infusion
48
+ iv fluids
49
+ laboratory techniques
50
+ laboratory testing
51
+ levophed
52
+ Lidocaine
53
+ manage
54
+ management
55
+ medication
56
+ midazolam
57
+ monitor
58
+ monitoring
59
+ Morphine
60
+ Nebulization
61
+ nitroglycerin
62
+ NTG
63
+ Norepinephrine
64
+ normal saline
65
+ Ondansetron
66
+ Oxygen
67
+ Oxygen Inhalation Therapy
68
+ oxygen therapy
69
+ Patient Management
70
+ Patient Monitoring
71
+ POCUS
72
+ point of care ultrasound
73
+ procedural sedation
74
+ procedure
75
+ radiologic imaging
76
+ Radiography
77
+ resuscitation
78
+ Sedation
79
+ splinting
80
+ Splints
81
+ supportive care
82
+ surgical procedures
83
+ Surgical Procedures Operative
84
+ surgery
85
+ Suture
86
+ Suturing
87
+ Therapeutic Intervention
88
+ Therapeutics
89
+ Therapy
90
+ tourniquet
91
+ transfusion
92
+ treat
93
+ treatment
94
+ Ultrasonography Point of Care
95
+ ultrasound
96
+ Vasoconstrictor Agents
97
+ vasopressors
98
+ ventilation support
99
+ Ventilators
100
+ Vital Signs
101
+ vital signs monitoring
102
+ wound care
103
+ Wound Dressing
104
+ Wound Management
105
+ X-Ray
dataset/scripts/01_filter_emergency.py ADDED
@@ -0,0 +1,58 @@
+ # scripts/01_filter_emergency.py
+
+ import os
+ import re
+ import pandas as pd
+
+ # Function: Load keywords and print progress
+ def load_keywords(path):
+     print(f"📥 Loading keywords from: {path}")
+     with open(path, "r", encoding="utf-8") as f:
+         kws = [line.strip() for line in f if line.strip()]
+     print(f" Loaded {len(kws)} keywords")
+     return kws
+
+ # Step 1: Read source data
+ print("1️⃣ Reading source data...")
+ source_path = "../dataset/guidelines_source_filtered.jsonl"
+ df = pd.read_json(source_path, lines=True)
+ print(f" Loaded {len(df)} records")
+
+ # Step 2: Load emergency keywords and match
+ print("2️⃣ Loading emergency keywords and matching...")
+ keywords = load_keywords("../keywords/emergency_keywords.txt")
+ pattern = r"\b(?:" + "|".join(keywords) + r")\b"  # Using non-capturing groups (?:...)
+
+ # Match keywords and add metadata columns
+ df["matched"] = (
+     df["clean_text"]
+     .fillna("")  # Convert NaN to empty string
+     .str.findall(pattern, flags=re.IGNORECASE)
+     .apply(lambda lst: "|".join(lst) if lst else "")
+ )
+ df["has_emergency"] = df["matched"].str.len() > 0
+
+ # Add metadata columns for future use
+ df["type"] = "emergency"  # Document type identifier
+ df["condition"] = ""  # Reserved for future condition mapping
+
+ # Calculate average matches
+ cnt_em = df["has_emergency"].sum()
+ avg_matches = (
+     df[df["has_emergency"]]["matched"]
+     .str.count(r"\|")  # Escape the pipe
+     .add(1)
+     .mean()
+ )
+
+ print(f" Matched {cnt_em} emergency-related records")
+ print(f" Average keywords per record: {avg_matches:.2f}")
+
+ # Step 3: Save emergency subset
+ print("3️⃣ Saving emergency subset...")
+ out_dir = "../dataset/emergency"
+ os.makedirs(out_dir, exist_ok=True)
+ subset = df[df["has_emergency"]]
+ subset.to_json(f"{out_dir}/emergency_subset.jsonl", orient="records", lines=True)
+ subset.to_csv(f"{out_dir}/emergency_subset.csv", index=False)
+ print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}`")
dataset/scripts/01_filter_emergency_opt.py ADDED
@@ -0,0 +1,112 @@
+ import os
+ import re
+ import json
+ import pandas as pd
+ from pathlib import Path
+
+ class MedicalTermProcessor:
+     def __init__(self):
+         # Load emergency special terms from JSON
+         keywords_dir = Path("../keywords")
+         with open(keywords_dir / "special_terms_emergency.json", "r") as f:
+             self.emergency_terms_by_category = json.load(f)
+
+         # Flatten the nested structure for easy lookup
+         self.emergency_special_terms = {}
+         for category in self.emergency_terms_by_category.values():
+             self.emergency_special_terms.update(category)
+
+     def get_all_variants(self):
+         """Get all term variants including special terms"""
+         variants = []
+         for term_list in self.emergency_special_terms.values():
+             variants.extend(term_list)
+         return variants
+
+     def standardize_term(self, term: str) -> str:
+         """Convert a term to its standard form if it's a variant"""
+         term_lower = term.lower()
+         for standard_term, variants in self.emergency_special_terms.items():
+             if term_lower in [v.lower() for v in variants]:
+                 return standard_term
+         return term
+
+     def process_matches(self, matches: list) -> str:
+         """Process matches to standardize terms and remove duplicates"""
+         if not matches:
+             return ""
+
+         # Standardize terms
+         standardized = [self.standardize_term(match) for match in matches]
+
+         # Remove duplicates while preserving order
+         seen = set()
+         unique_matches = []
+         for term in standardized:
+             if term.lower() not in seen:
+                 unique_matches.append(term)
+                 seen.add(term.lower())
+
+         return "|".join(unique_matches)
+
+ # Function: Load keywords and print progress
+ def load_keywords(path, processor):
+     print(f"📥 Loading keywords from: {path}")
+     # Load basic keywords
+     with open(path, "r", encoding="utf-8") as f:
+         basic_kws = [line.strip() for line in f if line.strip()]
+
+     # Add special term variants
+     special_kws = processor.get_all_variants()
+     all_kws = list(set(basic_kws + special_kws))  # Remove duplicates
+
+     print(f" Loaded {len(all_kws)} keywords (including variants)")
+     return all_kws
+
+ # Step 1: Read source data
+ print("1️⃣ Reading source data...")
+ source_path = "../dataset/guidelines_source_filtered.jsonl"
+ df = pd.read_json(source_path, lines=True)
+ print(f" Loaded {len(df)} records")
+
+ # Step 2: Load emergency keywords and match
+ print("2️⃣ Loading emergency keywords and matching...")
+ processor = MedicalTermProcessor()
+ keywords = load_keywords("../keywords/emergency_keywords.txt", processor)
+ pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+
+ # Match keywords and add metadata columns
+ df["matched"] = (
+     df["clean_text"]
+     .fillna("")  # Convert NaN to empty string
+     .str.findall(pattern, flags=re.IGNORECASE)
+     .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
+ )
+ df["has_emergency"] = df["matched"].str.len() > 0
+
+ # Add metadata columns for future use
+ df["type"] = "emergency"  # Document type identifier
+ df["condition"] = ""  # Reserved for future condition mapping
+
+ # Calculate average matches
+ cnt_em = df["has_emergency"].sum()
+ avg_matches = (
+     df[df["has_emergency"]]["matched"]
+     .str.count(r"\|")  # Escape the pipe
+     .add(1)
+     .mean()
+ )
+
+ print(f" Matched {cnt_em} emergency-related records")
+ print(f" Average keywords per record: {avg_matches:.2f}")
+
+ # Step 3: Save emergency subset
+ print("3️⃣ Saving emergency subset...")
+ out_dir = "../dataset/emergency"
+ os.makedirs(out_dir, exist_ok=True)
+ subset = df[df["has_emergency"]]
+
+ # Save with _opt suffix to distinguish from original files
+ subset.to_json(f"{out_dir}/emergency_subset_opt.jsonl", orient="records", lines=True)
+ subset.to_csv(f"{out_dir}/emergency_subset_opt.csv", index=False)
+ print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}` with _opt suffix")
dataset/scripts/02_filter_treatment.py ADDED
@@ -0,0 +1,103 @@
1
+ # scripts/02_filter_treatment.py
2
+
3
+ import os
4
+ import re
5
+ import pandas as pd
6
+
7
+ def preprocess_keywords(keywords_file):
8
+ """Load and preprocess treatment keywords"""
9
+ print(f"📥 Loading keywords from: {keywords_file}")
10
+
11
+ # Special medical terms with common variants
12
+ special_terms = {
13
+ 'x-ray': ['x-ray', 'x ray', 'xray'],
14
+ 'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
15
+ 'point-of-care': ['point-of-care', 'point of care']
16
+ }
17
+
18
+ # Read and preprocess keywords
19
+ with open(keywords_file, "r", encoding="utf-8") as f:
20
+ keywords = [line.strip().lower() for line in f if line.strip()]
21
+
22
+ # Process keywords and handle special terms
23
+ processed_keywords = []
24
+ for kw in keywords:
25
+ if kw in special_terms:
26
+ processed_keywords.extend(special_terms[kw])
27
+ else:
28
+ processed_keywords.append(kw)
29
+
30
+ print(f" Loaded {len(keywords)} base keywords")
31
+ print(f" Processed into {len(processed_keywords)} keyword variants")
32
+ return processed_keywords
33
+
34
+ def create_regex_pattern(keywords):
35
+ """Create compiled regex pattern with word boundaries"""
36
+ pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
37
+ return re.compile(pattern, re.IGNORECASE)
38
+
39
+ # Step 1: Read source data
40
+ print("1️⃣ Reading emergency subset...")
41
+ emergency_path = "../dataset/emergency/emergency_subset.jsonl"
42
+ df = pd.read_json(emergency_path, lines=True)
43
+ print(f" Loaded {len(df)} emergency records")
44
+ print(f" Contains emergency keywords in 'matched' column")
45
+
46
+ # Step 2: Load treatment keywords and match
47
+ print("2️⃣ Loading treatment keywords and matching...")
48
+ treatment_keywords = preprocess_keywords("../keywords/treatment_keywords.txt")
49
+ pattern = create_regex_pattern(treatment_keywords)
50
+
51
+ # Step 3: Process text and match keywords
52
+ print("3️⃣ Processing text and matching keywords...")
53
+ # Create lowercase version of text for matching
54
+ df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
55
+
56
+ # Match treatment keywords and add metadata columns
57
+ # Note: Preserving original 'matched' column from emergency subset
58
+ df["treatment_matched"] = (
59
+ df["clean_text_lower"]
60
+ .apply(lambda text: "|".join(pattern.findall(text)) or "")
61
+ )
62
+ df["has_treatment"] = df["treatment_matched"].str.len() > 0
63
+
64
+ # Add metadata columns for future use
65
+ df["type"] = "treatment" # Document type identifier
66
+ df["condition"] = "" # Reserved for future condition mapping
67
+
68
+ # Verify columns
69
+ print(" Verifying columns...")
70
+ print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
71
+ print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
72
+
73
+ # Calculate statistics
74
+ cnt_treat = df["has_treatment"].sum()
75
+ avg_matches = (
76
+ df[df["has_treatment"]]["treatment_matched"]
77
+ .str.count(r"\|")
78
+ .add(1)
79
+ .mean()
80
+ )
81
+
82
+ print(f" Found {cnt_treat} treatment-related records")
83
+ print(f" Average treatment keywords per record: {avg_matches:.2f}")
84
+
85
+ # Step 4: Save treatment subset
86
+ print("4️⃣ Saving treatment subset...")
87
+ out_dir = "../dataset/emergency_treatment"
88
+ os.makedirs(out_dir, exist_ok=True)
89
+
90
+ # Select records with treatment keywords
91
+ subset = df[df["has_treatment"]].copy() # Use copy to avoid SettingWithCopyWarning
92
+
93
+ # Verify final subset columns
94
+ print(" Final subset columns:")
95
+ print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
96
+ print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
97
+
98
+ subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
99
+ subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
100
+
101
+ print(f"✅ Generated treatment subset with {len(subset)} records")
102
+ print(f" Saved in: {out_dir}")
103
+ print(f" Contains both emergency and treatment keywords")
dataset/scripts/02_filter_treatment_opt.py ADDED
@@ -0,0 +1,131 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import pandas as pd
5
+ from pathlib import Path
6
+
7
+ class MedicalTermProcessor:
8
+ def __init__(self):
9
+ # Load treatment special terms from JSON
10
+ keywords_dir = Path("../keywords")
11
+ with open(keywords_dir / "special_terms_treatment.json", "r") as f:
12
+ self.treatment_terms_by_category = json.load(f)
13
+
14
+ # Flatten the nested structure for easy lookup
15
+ self.treatment_special_terms = {}
16
+ for category in self.treatment_terms_by_category.values():
17
+ self.treatment_special_terms.update(category)
18
+
19
+ def get_all_variants(self):
20
+ """Get all term variants including special terms"""
21
+ variants = []
22
+ for term_list in self.treatment_special_terms.values():
23
+ variants.extend(term_list)
24
+ return variants
25
+
26
+ def standardize_term(self, term: str) -> str:
27
+ """Convert a term to its standard form if it's a variant"""
28
+ term_lower = term.lower()
29
+ for standard_term, variants in self.treatment_special_terms.items():
30
+ if term_lower in [v.lower() for v in variants]:
31
+ return standard_term
32
+ return term
33
+
34
+ def process_matches(self, matches: list) -> str:
35
+ """Process matches to standardize terms and remove duplicates"""
36
+ if not matches:
37
+ return ""
38
+
39
+ # Standardize terms
40
+ standardized = [self.standardize_term(match) for match in matches]
41
+
42
+ # Remove duplicates while preserving order
43
+ seen = set()
44
+ unique_matches = []
45
+ for term in standardized:
46
+ if term.lower() not in seen:
47
+ unique_matches.append(term)
48
+ seen.add(term.lower())
49
+
50
+ return "|".join(unique_matches)
51
+
52
+ def load_keywords(path, processor):
53
+ """Load and preprocess treatment keywords"""
54
+ print(f"📥 Loading keywords from: {path}")
55
+
56
+ # Load basic keywords
57
+ with open(path, "r", encoding="utf-8") as f:
58
+ basic_kws = [line.strip() for line in f if line.strip()]
59
+
60
+ # Add special term variants
61
+ special_kws = processor.get_all_variants()
62
+ all_kws = list(set(basic_kws + special_kws)) # Remove duplicates
63
+
64
+ print(f" Loaded {len(all_kws)} keywords (including variants)")
65
+ return all_kws
66
+
67
+ # Step 1: Read optimized emergency subset
68
+ print("1️⃣ Reading optimized emergency subset...")
69
+ emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
70
+ df = pd.read_json(emergency_path, lines=True)
71
+ print(f" Loaded {len(df)} emergency records")
72
+ print(f" Contains emergency keywords in 'matched' column")
73
+
74
+ # Step 2: Load treatment keywords and match
75
+ print("2️⃣ Loading treatment keywords and matching...")
76
+ processor = MedicalTermProcessor()
77
+ keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
78
+ pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
79
+
80
+ # Step 3: Process text and match keywords
81
+ print("3️⃣ Processing text and matching keywords...")
82
+ # Match treatment keywords and add metadata columns
83
+ df["treatment_matched"] = (
84
+ df["clean_text"]
85
+ .fillna("") # Convert NaN to empty string
86
+ .str.findall(pattern, flags=re.IGNORECASE)
87
+ .apply(lambda matches: processor.process_matches(matches)) # Use new process_matches method
88
+ )
89
+ df["has_treatment"] = df["treatment_matched"].str.len() > 0
90
+
91
+ # Add metadata columns for future use
92
+ df["type"] = "treatment" # Document type identifier
93
+ df["condition"] = "" # Reserved for future condition mapping
94
+
95
+ # Verify columns
96
+ print(" Verifying columns...")
97
+ print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
98
+ print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
99
+
100
+ # Calculate statistics
101
+ cnt_treat = df["has_treatment"].sum()
102
+ avg_matches = (
103
+ df[df["has_treatment"]]["treatment_matched"]
104
+ .str.count(r"\|")
105
+ .add(1)
106
+ .mean()
107
+ )
108
+
109
+ print(f" Found {cnt_treat} treatment-related records")
110
+ print(f" Average treatment keywords per record: {avg_matches:.2f}")
111
+
112
+ # Step 4: Save treatment subset
113
+ print("4️⃣ Saving treatment subset...")
114
+ out_dir = "../dataset/emergency_treatment"
115
+ os.makedirs(out_dir, exist_ok=True)
116
+
117
+ # Select records with treatment keywords
118
+ subset = df[df["has_treatment"]].copy() # Use copy to avoid SettingWithCopyWarning
119
+
120
+ # Verify final subset columns
121
+ print(" Final subset columns:")
122
+ print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
123
+ print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
124
+
125
+ # Save with _opt suffix
126
+ subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
127
+ subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)
128
+
129
+ print(f"✅ Generated optimized treatment subset with {len(subset)} records")
130
+ print(f" Saved in: {out_dir}")
131
+ print(f" Contains both emergency and treatment keywords")
dataset/scripts/check_subset_integrity.py ADDED
@@ -0,0 +1,178 @@
1
+ #!/usr/bin/env python3
2
+ # /scripts/check_subset_integrity.py
3
+
4
+ import pandas as pd
5
+ import json
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+
9
+ def check_subset_sample(file_path, sample_size=100):
10
+ """
11
+ Check the first N rows of the subset file
12
+ """
13
+ print(f"\n{'='*60}")
14
+ print(f"📊 Sampling Analysis (first {sample_size} rows)")
15
+ print(f"{'='*60}")
16
+
17
+ # Read sample
18
+ print(f"\n1️⃣ Reading sample from: {file_path}")
19
+ sample_df = pd.read_csv(file_path, nrows=sample_size)
20
+
21
+ # Basic information
22
+ print("\n2️⃣ Basic Information:")
23
+ print(f" Columns present: {', '.join(sample_df.columns.tolist())}")
24
+
25
+ # Check matched columns
26
+ print("\n3️⃣ Matched Columns Status:")
27
+ matched_stats = {
28
+ 'matched': {
29
+ 'non_null': int(sample_df['matched'].notna().sum()),
30
+ 'non_empty': int((sample_df['matched'].str.len() > 0).sum()),
31
+ 'unique_values': sample_df['matched'].nunique()
32
+ },
33
+ 'treatment_matched': {
34
+ 'non_null': int(sample_df['treatment_matched'].notna().sum()),
35
+ 'non_empty': int((sample_df['treatment_matched'].str.len() > 0).sum()),
36
+ 'unique_values': sample_df['treatment_matched'].nunique()
37
+ }
38
+ }
39
+
40
+ for col, stats in matched_stats.items():
41
+ print(f"\n {col}:")
42
+ print(f" - Non-null count: {stats['non_null']}/{sample_size}")
43
+ print(f" - Non-empty count: {stats['non_empty']}/{sample_size}")
44
+ print(f" - Unique values: {stats['unique_values']}")
45
+
46
+ # Sample rows with both matches
47
+ print("\n4️⃣ Sample Rows with Both Matches:")
48
+ both_matched = sample_df[
49
+ (sample_df['matched'].notna() & (sample_df['matched'].str.len() > 0)) &
50
+ (sample_df['treatment_matched'].notna() & (sample_df['treatment_matched'].str.len() > 0))
51
+ ].head(3)
52
+
53
+ for idx, row in both_matched.iterrows():
54
+ print(f"\n Row {idx}:")
55
+ print(f" - Emergency keywords: {row['matched']}")
56
+ print(f" - Treatment keywords: {row['treatment_matched']}")
57
+
58
+ return matched_stats
59
+
60
+ def analyze_large_file(file_path, chunk_size=1000):
61
+ """
62
+ Analyze the entire file in chunks
63
+ """
64
+ print(f"\n{'='*60}")
65
+ print(f"📈 Full File Analysis (chunk size: {chunk_size})")
66
+ print(f"{'='*60}")
67
+
68
+ stats = {
69
+ 'total_rows': 0,
70
+ 'matched_stats': {
71
+ 'non_null': 0,
72
+ 'non_empty': 0
73
+ },
74
+ 'treatment_matched_stats': {
75
+ 'non_null': 0,
76
+ 'non_empty': 0
77
+ },
78
+ 'both_matched': 0
79
+ }
80
+
81
+ print("\n1️⃣ Processing file in chunks...")
82
+ chunks = pd.read_csv(file_path, chunksize=chunk_size)
83
+
84
+ for chunk in tqdm(chunks, desc="Analyzing chunks"):
85
+ # Update total rows
86
+ stats['total_rows'] += len(chunk)
87
+
88
+ # Update matched stats
89
+ stats['matched_stats']['non_null'] += chunk['matched'].notna().sum()
90
+ stats['matched_stats']['non_empty'] += (chunk['matched'].str.len() > 0).sum()
91
+
92
+ # Update treatment_matched stats
93
+ stats['treatment_matched_stats']['non_null'] += chunk['treatment_matched'].notna().sum()
94
+ stats['treatment_matched_stats']['non_empty'] += (chunk['treatment_matched'].str.len() > 0).sum()
95
+
96
+ # Update both matched count
97
+ stats['both_matched'] += (
98
+ (chunk['matched'].notna() & (chunk['matched'].str.len() > 0)) &
99
+ (chunk['treatment_matched'].notna() & (chunk['treatment_matched'].str.len() > 0))
100
+ ).sum()
101
+
102
+ return stats
103
+
104
+ def generate_report(sample_stats, full_stats, output_dir):
105
+ """
106
+ Generate and save analysis report
107
+ """
108
+ print(f"\n{'='*60}")
109
+ print(f"📝 Generating Report")
110
+ print(f"{'='*60}")
111
+
112
+ report = {
113
+ 'sample_analysis': sample_stats,
114
+ 'full_file_analysis': {
115
+ 'total_records': int(full_stats['total_rows']),
116
+ 'matched_column': {
117
+ 'non_null_count': int(full_stats['matched_stats']['non_null']),
118
+ 'non_empty_count': int(full_stats['matched_stats']['non_empty']),
119
+ 'null_percentage': float(
120
+ (full_stats['total_rows'] - full_stats['matched_stats']['non_null'])
121
+ / full_stats['total_rows'] * 100
122
+ )
123
+ },
124
+ 'treatment_matched_column': {
125
+ 'non_null_count': int(full_stats['treatment_matched_stats']['non_null']),
126
+ 'non_empty_count': int(full_stats['treatment_matched_stats']['non_empty']),
127
+ 'null_percentage': float(
128
+ (full_stats['total_rows'] - full_stats['treatment_matched_stats']['non_null'])
129
+ / full_stats['total_rows'] * 100
130
+ )
131
+ },
132
+ 'both_matched_count': int(full_stats['both_matched']),
133
+ 'both_matched_percentage': float(
134
+ full_stats['both_matched'] / full_stats['total_rows'] * 100
135
+ )
136
+ }
137
+ }
138
+
139
+ # Create output directory
140
+ output_dir = Path(output_dir)
141
+ output_dir.mkdir(parents=True, exist_ok=True)
142
+
143
+ # Save report
144
+ report_file = output_dir / 'integrity_check_report.json'
145
+ with open(report_file, 'w', encoding='utf-8') as f:
146
+ json.dump(report, f, indent=2, ensure_ascii=False)
147
+
148
+ print(f"\nReport saved to: {report_file}")
149
+
150
+ # Print summary
151
+ print("\n📊 Summary:")
152
+ print(f"Total records: {report['full_file_analysis']['total_records']}")
153
+ print(f"Records with both matches: {report['full_file_analysis']['both_matched_count']} "
154
+ f"({report['full_file_analysis']['both_matched_percentage']:.2f}%)")
155
+
156
+ return report
157
+
158
+ def main():
159
+ """
160
+ Main execution function
161
+ """
162
+ # Configuration
163
+ input_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
164
+ output_dir = "../analysis/integrity_check"
165
+
166
+ print(f"\n🔍 Starting Subset Integrity Check")
167
+ print(f"Input file: {input_file}")
168
+ print(f"Output directory: {output_dir}")
169
+
170
+ # Run analysis
171
+ sample_stats = check_subset_sample(input_file)
172
+ full_stats = analyze_large_file(input_file)
173
+ report = generate_report(sample_stats, full_stats, output_dir)
174
+
175
+ print("\n✅ Integrity check complete!")
176
+
177
+ if __name__ == "__main__":
178
+ main()
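A note on the boolean masks above: pandas combines conditions with `&`, which binds more tightly than comparison operators, so each comparison must sit in its own parentheses or the filter silently changes meaning. A minimal standalone illustration:

```python
import pandas as pd

s = pd.Series(["ab", "abc", ""])

unparenthesized = s.notna() & s.str.len() > 0   # parsed as (s.notna() & s.str.len()) > 0
intended = s.notna() & (s.str.len() > 0)        # non-null AND non-empty

print(unparenthesized.tolist())  # [False, True, False] -- "ab" is lost because True & 2 == 0
print(intended.tolist())         # [True, True, False]
```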
dataset/scripts/commit_message_20250726_special_terms.txt ADDED
@@ -0,0 +1,39 @@
1
+ refactor: migrate special terms to JSON configuration
2
+
3
+ BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files
4
+
5
+ 1. Create New Configuration Files:
6
+ - Add special_terms_emergency.json
7
+ - Organize emergency terms by categories (cardiac, respiratory, etc.)
8
+ - Include all existing mappings with standardized structure
9
+ - Add special_terms_treatment.json
10
+ - Organize treatment terms by categories (imaging, medications, etc.)
11
+ - Maintain all existing term variants
12
+
13
+ 2. Update Processing Scripts:
14
+ - Modify 01_filter_emergency_opt.py:
15
+ - Load terms from JSON configuration
16
+ - Add term standardization
17
+ - Implement deduplication
18
+ - Preserve category information
19
+ - Modify 02_filter_treatment_opt.py:
20
+ - Similar updates for treatment terms
21
+ - Maintain consistent processing logic
22
+
23
+ 3. New Features:
24
+ - Term standardization: Convert variants to standard form
25
+ - Deduplication: Remove repeated terms while preserving order
26
+ - Category-aware: Support for term categorization
27
+ - Improved maintainability: Configuration separated from code
28
+
29
+ 4. Technical Details:
30
+ - Use pathlib for file path handling
31
+ - JSON structure supports hierarchical organization
32
+ - Maintain backward compatibility
33
+ - Add type hints for better code clarity
34
+
35
+ Testing:
36
+ - Verify JSON format
37
+ - Confirm all mappings migrated correctly
38
+ - Check term standardization
39
+ - Validate deduplication logic
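The JSON layout itself is not shown in this diff; a plausible shape consistent with the description above (categories mapping a standard term to its variants), together with a small loader, purely as a hypothetical sketch:

```python
# Hypothetical shape of special_terms_treatment.json (the actual file in the
# repo may differ):
# {
#   "imaging":    {"x-ray": ["x-ray", "x ray", "xray"],
#                  "ct-scan": ["ct-scan", "ct scan", "ctscan"]},
#   "procedures": {"point-of-care": ["point-of-care", "point of care"]}
# }
import json
from pathlib import Path

def load_variants(config_path):
    """Flatten a category -> {standard: [variants]} config into one variant list."""
    config = json.loads(Path(config_path).read_text(encoding="utf-8"))
    variants = []
    for terms in config.values():
        for standard, forms in terms.items():
            variants.extend(forms)
    return variants
```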
dataset/scripts/compare_subsets_opt.py ADDED
@@ -0,0 +1,124 @@
1
+ # /scripts/compare_subsets_opt.py
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from datetime import datetime
5
+
6
+ def load_and_compare_subsets(format_type='csv'):
7
+ """
8
+ Load and compare the first 10 records from both optimized subsets
9
+
10
+ Args:
11
+ format_type (str): 'csv' or 'jsonl'
12
+ """
13
+ # Prepare output file
14
+ output_dir = Path("../analysis")
15
+ output_dir.mkdir(exist_ok=True)
16
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
17
+ output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"
18
+
19
+ # Initialize markdown content
20
+ md_content = []
21
+ md_content.append("# Optimized Subsets Comparison Report\n")
22
+ md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
23
+ md_content.append(f"File format: {format_type.upper()}\n")
24
+
25
+ # Set file paths based on format
26
+ if format_type == 'csv':
27
+ emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
28
+ treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
29
+ # Load CSV files
30
+ emergency_df = pd.read_csv(emergency_path)
31
+ treatment_df = pd.read_csv(treatment_path)
32
+ else: # jsonl
33
+ emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
34
+ treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
35
+ # Load JSONL files
36
+ emergency_df = pd.read_json(emergency_path, lines=True)
37
+ treatment_df = pd.read_json(treatment_path, lines=True)
38
+
39
+ # Print and save basic statistics
40
+ print("\n📊 Basic Statistics:")
41
+ print("-" * 40)
42
+ md_content.append("\n## Basic Statistics\n")
43
+
44
+ stats = [
45
+ f"- Emergency subset total records: {len(emergency_df)}",
46
+ f"- Emergency+Treatment subset total records: {len(treatment_df)}",
47
+ f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
48
+ f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
49
+ ]
50
+
51
+ # Calculate average keywords using pattern
52
+ pattern = r'\|'
53
+ emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
54
+ treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()
55
+
56
+ stats.extend([
57
+ f"- Avg Emergency Keywords: {emergency_avg:.2f}",
58
+ f"- Avg Treatment Keywords: {treatment_avg:.2f}"
59
+ ])
60
+
61
+ # Print to console and add to markdown
62
+ for stat in stats:
63
+ print(stat.replace("- ", ""))
64
+ md_content.extend(stats)
65
+
66
+ # Compare first 10 records from Emergency subset
67
+ print("\n🔍 First 10 records from Emergency Subset:")
68
+ print("-" * 80)
69
+ md_content.append("\n## Emergency Subset (First 10 Records)\n")
70
+
71
+ for idx, row in emergency_df.head(10).iterrows():
72
+ print(f"\nRecord #{idx+1}")
73
+ print(f"Text preview: {row['clean_text'][:100]}...")
74
+ print(f"Matched keywords: {row['matched']}")
75
+ print(f"Text length: {len(row['clean_text'])}")
76
+ print("-" * 40)
77
+
78
+ md_content.extend([
79
+ f"\n### Record {idx+1}",
80
+ "```",
81
+ f"Text preview: {row['clean_text'][:100]}...",
82
+ f"Matched keywords: {row['matched']}",
83
+ f"Text length: {len(row['clean_text'])}",
84
+ "```\n"
85
+ ])
86
+
87
+ # Compare first 10 records from Emergency+Treatment subset
88
+ print("\n🔍 First 10 records from Emergency+Treatment Subset:")
89
+ print("-" * 80)
90
+ md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")
91
+
92
+ for idx, row in treatment_df.head(10).iterrows():
93
+ print(f"\nRecord #{idx+1}")
94
+ print(f"Text preview: {row['clean_text'][:100]}...")
95
+ print(f"Emergency keywords: {row['matched']}")
96
+ print(f"Treatment keywords: {row['treatment_matched']}")
97
+ print(f"Text length: {len(row['clean_text'])}")
98
+ print("-" * 40)
99
+
100
+ md_content.extend([
101
+ f"\n### Record {idx+1}",
102
+ "```",
103
+ f"Text preview: {row['clean_text'][:100]}...",
104
+ f"Emergency keywords: {row['matched']}",
105
+ f"Treatment keywords: {row['treatment_matched']}",
106
+ f"Text length: {len(row['clean_text'])}",
107
+ "```\n"
108
+ ])
109
+
110
+ # Save markdown content
111
+ with open(output_file, 'w', encoding='utf-8') as f:
112
+ f.write('\n'.join(md_content))
113
+
114
+ print(f"\n✅ Comparison complete!")
115
+ print(f"Report saved to: {output_file}")
116
+
117
+ if __name__ == "__main__":
118
+ # Compare using CSV format
119
+ print("\nComparing CSV files...")
120
+ load_and_compare_subsets('csv')
121
+
122
+ # Compare using JSONL format
123
+ print("\nComparing JSONL files...")
124
+ load_and_compare_subsets('jsonl')
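Both subsets store keyword hits as pipe-separated strings, which is why the averages above are estimated with `str.count(r'\|') + 1`. For downstream work the strings can be expanded back into lists; a small sketch, with the input path assumed to match the layout used by the other scripts here:

```python
import pandas as pd

# Assumed path; run from dataset/scripts/ like the other tools in this directory.
df = pd.read_json("../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl", lines=True)

# Split the pipe-separated keyword strings back into Python lists.
df["matched_list"] = df["matched"].fillna("").str.split("|")
df["treatment_list"] = df["treatment_matched"].fillna("").str.split("|")

# Per-record keyword counts; an empty string splits to [""], so drop empties.
df["n_treatment_kws"] = df["treatment_list"].apply(lambda kws: len([k for k in kws if k]))
```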
dataset/scripts/data_explorer.py ADDED
@@ -0,0 +1,123 @@
1
+ # /scripts/data_explorer.py
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import json
8
+
9
+ def analyze_subset(file_path, keywords_path, output_dir="analysis"):
10
+ """Analyze subset data quality and distribution"""
11
+ print(f"\n{'='*50}")
12
+ print(f"Starting dataset analysis: {file_path}")
13
+ print(f"Using keywords file: {keywords_path}")
14
+ print(f"Output directory: {output_dir}")
15
+ print(f"{'='*50}\n")
16
+
17
+ # Load data
18
+ print("1️⃣ Loading data...")
19
+ df = pd.read_csv(file_path)
20
+ output_dir = Path(output_dir)
21
+
22
+ # 1. Basic statistics
23
+ print("\n2️⃣ Calculating basic statistics...")
24
+ total = len(df)
25
+ df['text_length'] = df['clean_text'].str.len()
26
+ avg_len = df['text_length'].mean()
27
+ print(f"Total records: {total}")
28
+ print(f"Average text length: {avg_len:.2f}")
29
+
30
+ # Initialize statistics dictionary with native Python types
31
+ stats = {
32
+ 'basic_statistics': {
33
+ 'total_records': int(total),
34
+ 'avg_length': float(avg_len)
35
+ },
36
+ 'keyword_statistics': {}
37
+ }
38
+
39
+ # 2. Keyword analysis
40
+ print("\n3️⃣ Performing keyword analysis...")
41
+ with open(keywords_path, 'r') as f:
42
+ keywords = [line.strip() for line in f if line.strip()]
43
+ print(f"Loaded {len(keywords)} keywords")
44
+
45
+ # Count keywords and store in stats
46
+ for keyword in keywords:
47
+ cnt = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
48
+ stats['keyword_statistics'][keyword] = int(cnt)
49
+ print(f" - {keyword}: {cnt} records")
50
+
51
+ # 3. Visualization
52
+ print("\n4️⃣ Generating visualizations...")
53
+ output_path = Path(output_dir) / "plots"
54
+ output_path.mkdir(parents=True, exist_ok=True)
55
+ print(f"Charts will be saved in: {output_path}")
56
+
57
+ # 3.1 Keyword distribution chart
58
+ print(" - Generating keyword distribution chart...")
59
+ plt.figure(figsize=(15, 8))
60
+ plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
61
+ plt.xticks(rotation=45, ha='right')
62
+ # TODO: change the title to the name of the subset
63
+ plt.title('Keyword Distribution for Emergency Subset')
64
+ plt.xlabel('Keywords')
65
+ plt.ylabel('Match Count')
66
+ # TODO: change the name of the file to the name of the subset
67
+ plt.savefig(output_path / "keyword_distribution_emergency_subset.png", bbox_inches='tight')
68
+ plt.close()
69
+
70
+ # 3.2 Text length distribution
71
+ print(" - Generating text length distribution...")
72
+ plt.figure(figsize=(10, 6))
73
+ df['text_length'].hist(bins=50)
74
+ plt.title('Text Length Distribution')
75
+ plt.xlabel('Text Length')
76
+ plt.ylabel('Frequency')
77
+ # TODO: change the name of the file to the name of the subset
78
+ plt.savefig(output_path / "text_length_dist_emergency_subset.png", bbox_inches='tight')
79
+ plt.close()
80
+
81
+ # 3.3 Keyword co-occurrence analysis
82
+ print(" - Generating keyword co-occurrence heatmap...")
83
+ cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
84
+ for text in df['clean_text'].fillna(''):
85
+ present_keywords = [k for k in keywords if k.lower() in text.lower()]
86
+ for i, k1 in enumerate(present_keywords):
87
+ for j, k2 in enumerate(present_keywords):
88
+ if i != j:
89
+ cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1
90
+
91
+ plt.figure(figsize=(12, 8))
92
+ sns.heatmap(cooccurrence_matrix,
93
+ xticklabels=keywords,
94
+ yticklabels=keywords,
95
+ cmap='YlOrRd')
96
+ plt.title('Keyword Co-occurrence Heatmap')
97
+ plt.xticks(rotation=45, ha='right')
98
+ plt.tight_layout()
99
+ # TODO: change the name of the file to the name of the subset
100
+ plt.savefig(output_path / "keyword_cooccurrence_emergency_subset.png", bbox_inches='tight')
101
+ plt.close()
102
+
103
+ # 4. Save statistics
104
+ print("\n5️⃣ Saving statistics...")
105
+ stats_path = Path(output_dir) / "stats"
106
+ stats_path.mkdir(parents=True, exist_ok=True)
107
+ # TODO: change the name of the file to the name of the subset
108
+ stats_file = stats_path / "analysis_stats_emergency_subset.json"
109
+
110
+ with open(stats_file, 'w', encoding='utf-8') as f:
111
+ json.dump(stats, f, indent=2, ensure_ascii=False)
112
+ print(f"Statistics saved to: {stats_file}")
113
+
114
+ print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")
115
+
116
+ if __name__ == "__main__":
117
+ # Set file paths
118
+ emergency_subset = "../dataset/emergency/emergency_subset.csv"
119
+ emergency_keywords = "../keywords/emergency_keywords.txt"
120
+ output_dir = "../analysis"
121
+
122
+ # Run analysis
123
+ analyze_subset(emergency_subset, emergency_keywords, output_dir)
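The nested co-occurrence loop above is O(n_records * n_keywords^2) with repeated `keywords.index()` lookups; the treatment explorers later in this diff switch to a boolean presence matrix and a single matrix product. A minimal sketch of that approach for a single keyword list, using the same word-boundary matching those scripts use:

```python
import re
import numpy as np

def cooccurrence_counts(texts, keywords):
    """Documents x keywords presence matrix, then one matrix product."""
    presence = np.zeros((len(texts), len(keywords)), dtype=bool)
    for j, kw in enumerate(keywords):
        pattern = re.compile(r"\b" + re.escape(kw.lower()) + r"\b")
        presence[:, j] = [bool(pattern.search(str(t).lower())) for t in texts]
    counts = presence.astype(int).T @ presence.astype(int)
    np.fill_diagonal(counts, 0)  # drop self-pairs, as the nested loop does
    return counts
```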
dataset/scripts/data_explorer_opt.py ADDED
@@ -0,0 +1,118 @@
1
+ # /scripts/data_explorer_opt.py
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import json
8
+
9
+ def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
10
+ """Analyze subset data quality and distribution"""
11
+ print(f"\n{'='*50}")
12
+ print(f"Starting optimized dataset analysis: {file_path}")
13
+ print(f"Using keywords file: {keywords_path}")
14
+ print(f"Output directory: {output_dir}")
15
+ print(f"{'='*50}\n")
16
+
17
+ # Load data
18
+ print("1️⃣ Loading data...")
19
+ df = pd.read_csv(file_path)
20
+ output_dir = Path(output_dir)
21
+
22
+ # 1. Basic statistics
23
+ print("\n2️⃣ Calculating basic statistics...")
24
+ total = len(df)
25
+ df['text_length'] = df['clean_text'].str.len()
26
+ avg_len = df['text_length'].mean()
27
+ print(f"Total records: {total}")
28
+ print(f"Average text length: {avg_len:.2f}")
29
+
30
+ # Initialize statistics dictionary with native Python types
31
+ stats = {
32
+ 'basic_statistics': {
33
+ 'total_records': int(total),
34
+ 'avg_length': float(avg_len)
35
+ },
36
+ 'keyword_statistics': {}
37
+ }
38
+
39
+ # 2. Keyword analysis
40
+ print("\n3️⃣ Performing keyword analysis...")
41
+ with open(keywords_path, 'r') as f:
42
+ keywords = [line.strip() for line in f if line.strip()]
43
+ print(f"Loaded {len(keywords)} keywords")
44
+
45
+ # Count keywords and store in stats
46
+ for keyword in keywords:
47
+ cnt = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
48
+ stats['keyword_statistics'][keyword] = int(cnt)
49
+ print(f" - {keyword}: {cnt} records")
50
+
51
+ # 3. Visualization
52
+ print("\n4️⃣ Generating visualizations...")
53
+ output_path = Path(output_dir) / "plots"
54
+ output_path.mkdir(parents=True, exist_ok=True)
55
+ print(f"Charts will be saved in: {output_path}")
56
+
57
+ # 3.1 Keyword distribution chart
58
+ print(" - Generating keyword distribution chart...")
59
+ plt.figure(figsize=(15, 8))
60
+ plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
61
+ plt.xticks(rotation=45, ha='right')
62
+ plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
63
+ plt.xlabel('Keywords')
64
+ plt.ylabel('Match Count')
65
+ plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
66
+ plt.close()
67
+
68
+ # 3.2 Text length distribution
69
+ print(" - Generating text length distribution...")
70
+ plt.figure(figsize=(10, 6))
71
+ df['text_length'].hist(bins=50)
72
+ plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
73
+ plt.xlabel('Text Length')
74
+ plt.ylabel('Frequency')
75
+ plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
76
+ plt.close()
77
+
78
+ # 3.3 Keyword co-occurrence analysis
79
+ print(" - Generating keyword co-occurrence heatmap...")
80
+ cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
81
+ for text in df['clean_text'].fillna(''):
82
+ present_keywords = [k for k in keywords if k.lower() in text.lower()]
83
+ for i, k1 in enumerate(present_keywords):
84
+ for j, k2 in enumerate(present_keywords):
85
+ if i != j:
86
+ cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1
87
+
88
+ plt.figure(figsize=(12, 8))
89
+ sns.heatmap(cooccurrence_matrix,
90
+ xticklabels=keywords,
91
+ yticklabels=keywords,
92
+ cmap='YlOrRd')
93
+ plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
94
+ plt.xticks(rotation=45, ha='right')
95
+ plt.tight_layout()
96
+ plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
97
+ plt.close()
98
+
99
+ # 4. Save statistics
100
+ print("\n5️⃣ Saving statistics...")
101
+ stats_path = Path(output_dir) / "stats"
102
+ stats_path.mkdir(parents=True, exist_ok=True)
103
+ stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"
104
+
105
+ with open(stats_file, 'w', encoding='utf-8') as f:
106
+ json.dump(stats, f, indent=2, ensure_ascii=False)
107
+ print(f"Statistics saved to: {stats_file}")
108
+
109
+ print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")
110
+
111
+ if __name__ == "__main__":
112
+ # Set file paths for optimized version
113
+ emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
114
+ emergency_keywords = "../keywords/emergency_keywords.txt"
115
+ output_dir = "../analysis"
116
+
117
+ # Run analysis
118
+ analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
dataset/scripts/data_explorer_treatment.py ADDED
@@ -0,0 +1,265 @@
1
+ # /scripts/data_explorer_treatment.py
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import json
8
+ from tqdm import tqdm
9
+ import re
10
+
11
+ def calculate_density(matches, text_length):
12
+ """
13
+ Calculate keyword density per 1000 characters of text
14
+
15
+ Args:
16
+ matches: Number of keyword matches
17
+ text_length: Total text length in characters
18
+
19
+ Returns:
20
+ float: Density per 1000 characters
21
+ """
22
+ return (matches / text_length) * 1000
23
+
24
+ def analyze_treatment_subset(
25
+ treatment_file_path,
26
+ emergency_keywords_path,
27
+ treatment_keywords_path,
28
+ output_dir="analysis_treatment"
29
+ ):
30
+ """
31
+ Specialized analysis for treatment subset focusing on:
32
+ 1. Dual keyword analysis (emergency + treatment)
33
+ 2. Path B effectiveness validation
34
+ 3. Condition mapping data preparation
35
+ 4. RAG readiness assessment
36
+ """
37
+ print(f"\n{'='*60}")
38
+ print(f"Treatment Subset Analysis")
39
+ print(f"Treatment file: {treatment_file_path}")
40
+ print(f"Emergency keywords: {emergency_keywords_path}")
41
+ print(f"Treatment keywords: {treatment_keywords_path}")
42
+ print(f"Output directory: {output_dir}")
43
+ print(f"{'='*60}\n")
44
+
45
+ # Load data
46
+ print("1️⃣ Loading treatment subset data...")
47
+ df = pd.read_csv(treatment_file_path)
48
+ output_dir = Path(output_dir)
49
+
50
+ # Load keyword lists
51
+ print("2️⃣ Loading keyword lists...")
52
+ with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
53
+ emergency_keywords = [line.strip() for line in f if line.strip()]
54
+
55
+ with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
56
+ treatment_keywords = [line.strip() for line in f if line.strip()]
57
+
58
+ print(f" Emergency keywords: {len(emergency_keywords)}")
59
+ print(f" Treatment keywords: {len(treatment_keywords)}")
60
+
61
+ # Basic statistics
62
+ print("\n3️⃣ Computing basic statistics...")
63
+ total_records = len(df)
64
+ df['text_length'] = df['clean_text'].str.len()
65
+ avg_length = df['text_length'].mean()
66
+
67
+ print(f" Total treatment records: {total_records}")
68
+ print(f" Average text length: {avg_length:.2f} characters")
69
+
70
+ # Initialize comprehensive statistics
71
+ stats = {
72
+ 'basic_statistics': {
73
+ 'total_records': int(total_records),
74
+ 'avg_text_length': float(avg_length),
75
+ 'emergency_keywords_count': len(emergency_keywords),
76
+ 'treatment_keywords_count': len(treatment_keywords)
77
+ },
78
+ 'emergency_keyword_stats': {},
79
+ 'treatment_keyword_stats': {},
80
+ 'cooccurrence_analysis': {},
81
+ 'path_b_validation': {},
82
+ 'condition_mapping_candidates': {}
83
+ }
84
+
85
+ # Emergency keyword analysis in treatment subset
86
+ print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
87
+ for keyword in emergency_keywords:
88
+ count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
89
+ stats['emergency_keyword_stats'][keyword] = int(count)
90
+ print(f" Emergency: {keyword} -> {count} records")
91
+
92
+ # Treatment keyword analysis
93
+ print("\n5️⃣ Analyzing treatment keywords...")
94
+ for keyword in treatment_keywords:
95
+ count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
96
+ stats['treatment_keyword_stats'][keyword] = int(count)
97
+ print(f" Treatment: {keyword} -> {count} records")
98
+
99
+ # Step 6: Co-occurrence analysis
100
+ print("\n6️⃣ Computing keyword co-occurrence patterns...")
101
+
102
+ # Initialize matrices for full dataset
103
+ emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
104
+ treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
105
+
106
+ # Pre-process text
107
+ print(" Pre-processing text...")
108
+ df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
109
+
110
+ # Process all emergency keywords
111
+ print("\n Processing all emergency keywords...")
112
+ for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
113
+ # Using word boundary instead of negative lookbehind/lookahead
114
+ pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
115
+ emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
116
+ matches = emergency_matrix[:, i].sum()
117
+ print(f" - {keyword}: {matches} matches")
118
+
119
+ # Process all treatment keywords
120
+ print("\n Processing all treatment keywords...")
121
+ for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
122
+ # Using word boundary instead of negative lookbehind/lookahead
123
+ pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
124
+ treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
125
+ matches = treatment_matrix[:, i].sum()
126
+ print(f" - {keyword}: {matches} matches")
127
+
128
+ # Compute co-occurrence matrix
129
+ print("\n Computing co-occurrence matrix...")
130
+ cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
131
+ print(" Computation completed successfully")
132
+
133
+ # Extract results
134
+ print(" Extracting co-occurrence pairs...")
135
+ cooccurrence_pairs = []
136
+ for i, em_kw in enumerate(emergency_keywords):
137
+ for j, tr_kw in enumerate(treatment_keywords):
138
+ count = int(cooc_matrix[i, j])
139
+ if count > 0:
140
+ cooccurrence_pairs.append({
141
+ 'emergency_keyword': em_kw,
142
+ 'treatment_keyword': tr_kw,
143
+ 'cooccurrence_count': count,
144
+ 'percentage': float(count / len(df) * 100)
145
+ })
146
+
147
+ # Sort and store results
148
+ cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
149
+ stats['cooccurrence_analysis'] = cooccurrence_pairs[:20] # Top 20 pairs
150
+
151
+ print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
152
+ print(" Top 5 co-occurrence pairs:")
153
+ for i, pair in enumerate(cooccurrence_pairs[:5]):
154
+ print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
155
+
156
+ # Step 7: Path B validation metrics
157
+ print("\n7️⃣ Validating Path B strategy effectiveness...")
158
+
159
+ # Compute keyword density with progress bar
160
+ print(" Computing keyword density...")
161
+ with tqdm(total=2, desc="Density calculation") as pbar:
162
+ # Calculate density per 1000 characters for both emergency and treatment keywords
163
+ emergency_density = calculate_density(
164
+ emergency_matrix.sum(axis=1),
165
+ df['text_length']
166
+ )
167
+ pbar.update(1)
168
+
169
+ treatment_density = calculate_density(
170
+ treatment_matrix.sum(axis=1),
171
+ df['text_length']
172
+ )
173
+ pbar.update(1)
174
+
175
+ # Store density in dataframe for visualization
176
+ df['emergency_keyword_density'] = emergency_density
177
+ df['treatment_keyword_density'] = treatment_density
178
+
179
+ # Calculate statistics with the new density metrics
180
+ stats['path_b_validation'] = {
181
+ 'avg_emergency_density': float(np.mean(emergency_density)),
182
+ 'avg_treatment_density': float(np.mean(treatment_density)),
183
+ 'high_density_records': int(sum(
184
+ (emergency_density >= np.percentile(emergency_density, 75)) &
185
+ (treatment_density >= np.percentile(treatment_density, 75))
186
+ )),
187
+ 'precision_estimate': float(sum(
188
+ (emergency_density > 0) & (treatment_density > 0)
189
+ ) / len(df))
190
+ }
191
+
192
+ # Print detailed results
193
+ print("\n Results:")
194
+ print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
195
+ print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
196
+ print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
197
+ print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
198
+
199
+ # Sample distribution analysis
200
+ print("\n Density Distribution:")
201
+ density_counts = pd.DataFrame({
202
+ 'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
203
+ 'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
204
+ }).value_counts().head()
205
+ print(" Top 5 density combinations (emergency, treatment):")
206
+ for (em, tr), count in density_counts.items():
207
+ print(f" - {count} documents have {em} emergency and {tr} treatment density")
208
+
209
+ # Visualization
210
+ print("\n8️⃣ Generating visualizations...")
211
+ output_plots = output_dir / "plots"
212
+ output_plots.mkdir(parents=True, exist_ok=True)
213
+
214
+ # 1. Keyword density scatter plot with improved visualization
215
+ plt.figure(figsize=(12, 8))
216
+ plt.scatter(
217
+ emergency_density,
218
+ treatment_density,
219
+ alpha=0.6,
220
+ c=np.log1p(df['text_length']), # Color by log text length
221
+ cmap='viridis'
222
+ )
223
+ plt.colorbar(label='Log Text Length')
224
+ plt.xlabel('Emergency Keyword Density (per 1000 characters)')
225
+ plt.ylabel('Treatment Keyword Density (per 1000 characters)')
226
+ plt.title('Emergency vs Treatment Keyword Density')
227
+ plt.grid(True, alpha=0.3)
228
+
229
+ # Add mean lines
230
+ plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
231
+ plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
232
+ plt.legend()
233
+
234
+ plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight', dpi=300)
235
+ plt.close()
236
+
237
+ # Save comprehensive statistics
238
+ print("\n9️⃣ Saving analysis results...")
239
+ stats_dir = output_dir / "stats"
240
+ stats_dir.mkdir(parents=True, exist_ok=True)
241
+
242
+ with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
243
+ json.dump(stats, f, indent=2, ensure_ascii=False)
244
+
245
+ print(f"✅ Treatment subset analysis complete!")
246
+ print(f" Results saved to: {output_dir}")
247
+ print(f" Plots: {output_plots}")
248
+ print(f" Statistics: {stats_dir}")
249
+
250
+ return stats
251
+
252
+ if __name__ == "__main__":
253
+ # Configuration
254
+ treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
255
+ emergency_keywords = "../keywords/emergency_keywords.txt"
256
+ treatment_keywords = "../keywords/treatment_keywords.txt"
257
+ output_directory = "../analysis_treatment"
258
+
259
+ # Run analysis
260
+ results = analyze_treatment_subset(
261
+ treatment_file,
262
+ emergency_keywords,
263
+ treatment_keywords,
264
+ output_directory
265
+ )
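Since `text_length` comes from `clean_text.str.len()`, the density metric is keyword hits per 1,000 characters of text. A quick worked example of the formula used by `calculate_density`:

```python
# Five keyword hits in a 2,000-character document:
density = (5 / 2000) * 1000
print(density)  # ~2.5 hits per 1,000 characters
```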
dataset/scripts/data_explorer_treatment_opt.py ADDED
@@ -0,0 +1,262 @@
1
+ # /scripts/data_explorer_treatment_opt.py
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import json
8
+ from tqdm import tqdm
9
+ import re
10
+
11
+ def calculate_density(matches, text_length):
12
+ """
13
+ Calculate keyword density per 1000 characters of text
14
+
15
+ Args:
16
+ matches: Number of keyword matches
17
+ text_length: Total text length in characters
18
+
19
+ Returns:
20
+ float: Density per 1000 characters
21
+ """
22
+ return (matches / text_length) * 1000
23
+
24
+ def analyze_treatment_subset(
25
+ treatment_file_path,
26
+ emergency_keywords_path,
27
+ treatment_keywords_path,
28
+ output_dir="analysis_treatment_opt" # Updated default output directory
29
+ ):
30
+ """
31
+ Specialized analysis for optimized treatment subset focusing on:
32
+ 1. Dual keyword analysis (emergency + treatment)
33
+ 2. Path B effectiveness validation
34
+ 3. Condition mapping data preparation
35
+ 4. RAG readiness assessment
36
+ """
37
+ print(f"\n{'='*60}")
38
+ print(f"Treatment Subset Analysis (Optimized Version)")
39
+ print(f"Treatment file: {treatment_file_path}")
40
+ print(f"Emergency keywords: {emergency_keywords_path}")
41
+ print(f"Treatment keywords: {treatment_keywords_path}")
42
+ print(f"Output directory: {output_dir}")
43
+ print(f"{'='*60}\n")
44
+
45
+ # Load data
46
+ print("1️⃣ Loading optimized treatment subset data...")
47
+ df = pd.read_csv(treatment_file_path)
48
+ output_dir = Path(output_dir)
49
+
50
+ # Load keyword lists
51
+ print("2️⃣ Loading keyword lists...")
52
+ with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
53
+ emergency_keywords = [line.strip() for line in f if line.strip()]
54
+
55
+ with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
56
+ treatment_keywords = [line.strip() for line in f if line.strip()]
57
+
58
+ print(f" Emergency keywords: {len(emergency_keywords)}")
59
+ print(f" Treatment keywords: {len(treatment_keywords)}")
60
+
61
+ # Basic statistics
62
+ print("\n3️⃣ Computing basic statistics...")
63
+ total_records = len(df)
64
+ df['text_length'] = df['clean_text'].str.len()
65
+ avg_length = df['text_length'].mean()
66
+
67
+ print(f" Total treatment records: {total_records}")
68
+ print(f" Average text length: {avg_length:.2f} characters")
69
+
70
+ # Initialize comprehensive statistics
71
+ stats = {
72
+ 'basic_statistics': {
73
+ 'total_records': int(total_records),
74
+ 'avg_text_length': float(avg_length),
75
+ 'emergency_keywords_count': len(emergency_keywords),
76
+ 'treatment_keywords_count': len(treatment_keywords)
77
+ },
78
+ 'emergency_keyword_stats': {},
79
+ 'treatment_keyword_stats': {},
80
+ 'cooccurrence_analysis': {},
81
+ 'path_b_validation': {},
82
+ 'condition_mapping_candidates': {}
83
+ }
84
+
85
+ # Emergency keyword analysis in treatment subset
86
+ print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
87
+ for keyword in emergency_keywords:
88
+ count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
89
+ stats['emergency_keyword_stats'][keyword] = int(count)
90
+ print(f" Emergency: {keyword} -> {count} records")
91
+
92
+ # Treatment keyword analysis
93
+ print("\n5️⃣ Analyzing treatment keywords...")
94
+ for keyword in treatment_keywords:
95
+ count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
96
+ stats['treatment_keyword_stats'][keyword] = int(count)
97
+ print(f" Treatment: {keyword} -> {count} records")
98
+
99
+ # Step 6: Co-occurrence analysis
100
+ print("\n6️⃣ Computing keyword co-occurrence patterns...")
101
+
102
+ # Initialize matrices for full dataset
103
+ emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
104
+ treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
105
+
106
+ # Pre-process text
107
+ print(" Pre-processing text...")
108
+ df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
109
+
110
+ # Process all emergency keywords
111
+ print("\n Processing all emergency keywords...")
112
+ for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
113
+ pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
114
+ emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
115
+ matches = emergency_matrix[:, i].sum()
116
+ print(f" - {keyword}: {matches} matches")
117
+
118
+ # Process all treatment keywords
119
+ print("\n Processing all treatment keywords...")
120
+ for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
121
+ pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
122
+ treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
123
+ matches = treatment_matrix[:, i].sum()
124
+ print(f" - {keyword}: {matches} matches")
125
+
126
+ # Compute co-occurrence matrix
127
+ print("\n Computing co-occurrence matrix...")
128
+ cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
129
+ print(" Computation completed successfully")
130
+
131
+ # Extract results
132
+ print(" Extracting co-occurrence pairs...")
133
+ cooccurrence_pairs = []
134
+ for i, em_kw in enumerate(emergency_keywords):
135
+ for j, tr_kw in enumerate(treatment_keywords):
136
+ count = int(cooc_matrix[i, j])
137
+ if count > 0:
138
+ cooccurrence_pairs.append({
139
+ 'emergency_keyword': em_kw,
140
+ 'treatment_keyword': tr_kw,
141
+ 'cooccurrence_count': count,
142
+ 'percentage': float(count / len(df) * 100)
143
+ })
144
+
145
+ # Sort and store results
146
+ cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
147
+ stats['cooccurrence_analysis'] = cooccurrence_pairs[:20] # Top 20 pairs
148
+
149
+ print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
150
+ print(" Top 5 co-occurrence pairs:")
151
+ for i, pair in enumerate(cooccurrence_pairs[:5]):
152
+ print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
153
+
154
+ # Step 7: Path B validation metrics
155
+ print("\n7️⃣ Validating Path B strategy effectiveness...")
156
+
157
+ # Compute keyword density with progress bar
158
+ print(" Computing keyword density...")
159
+ with tqdm(total=2, desc="Density calculation") as pbar:
160
+ emergency_density = calculate_density(
161
+ emergency_matrix.sum(axis=1),
162
+ df['text_length']
163
+ )
164
+ pbar.update(1)
165
+
166
+ treatment_density = calculate_density(
167
+ treatment_matrix.sum(axis=1),
168
+ df['text_length']
169
+ )
170
+ pbar.update(1)
171
+
172
+ # Store density in dataframe for visualization
173
+ df['emergency_keyword_density'] = emergency_density
174
+ df['treatment_keyword_density'] = treatment_density
175
+
176
+ # Calculate statistics with the new density metrics
177
+ stats['path_b_validation'] = {
178
+ 'avg_emergency_density': float(np.mean(emergency_density)),
179
+ 'avg_treatment_density': float(np.mean(treatment_density)),
180
+ 'high_density_records': int(sum(
181
+ (emergency_density >= np.percentile(emergency_density, 75)) &
182
+ (treatment_density >= np.percentile(treatment_density, 75))
183
+ )),
184
+ 'precision_estimate': float(sum(
185
+ (emergency_density > 0) & (treatment_density > 0)
186
+ ) / len(df))
187
+ }
188
+
189
+ # Print detailed results
190
+ print("\n Results:")
191
+ print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
192
+ print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
193
+ print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
194
+ print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
195
+
196
+ # Sample distribution analysis
197
+ print("\n Density Distribution:")
198
+ density_counts = pd.DataFrame({
199
+ 'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
200
+ 'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
201
+ }).value_counts().head()
202
+ print(" Top 5 density combinations (emergency, treatment):")
203
+ for (em, tr), count in density_counts.items():
204
+ print(f" - {count} documents have {em} emergency and {tr} treatment density")
205
+
206
+ # Visualization
207
+ print("\n8️⃣ Generating visualizations...")
208
+ output_plots = output_dir / "plots"
209
+ output_plots.mkdir(parents=True, exist_ok=True)
210
+
211
+ # 1. Keyword density scatter plot with improved visualization
212
+ plt.figure(figsize=(12, 8))
213
+ plt.scatter(
214
+ emergency_density,
215
+ treatment_density,
216
+ alpha=0.6,
217
+ c=np.log1p(df['text_length']),
218
+ cmap='viridis'
219
+ )
220
+ plt.colorbar(label='Log Text Length')
221
+ plt.xlabel('Emergency Keyword Density (per 1000 characters)')
222
+ plt.ylabel('Treatment Keyword Density (per 1000 characters)')
223
+ plt.title('Emergency vs Treatment Keyword Density (Optimized)')
224
+ plt.grid(True, alpha=0.3)
225
+
226
+ # Add mean lines
227
+ plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
228
+ plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
229
+ plt.legend()
230
+
231
+ plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
232
+ plt.close()
233
+
234
+ # Save comprehensive statistics
235
+ print("\n9️⃣ Saving analysis results...")
236
+ stats_dir = output_dir / "stats"
237
+ stats_dir.mkdir(parents=True, exist_ok=True)
238
+
239
+ with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
240
+ json.dump(stats, f, indent=2, ensure_ascii=False)
241
+
242
+ print(f"✅ Treatment subset analysis complete! (Optimized Version)")
243
+ print(f" Results saved to: {output_dir}")
244
+ print(f" Plots: {output_plots}")
245
+ print(f" Statistics: {stats_dir}")
246
+
247
+ return stats
248
+
249
+ if __name__ == "__main__":
250
+ # Configuration for optimized version
251
+ treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
252
+ emergency_keywords = "../keywords/emergency_keywords.txt"
253
+ treatment_keywords = "../keywords/treatment_keywords.txt"
254
+ output_directory = "../analysis_treatment_opt"
255
+
256
+ # Run analysis
257
+ results = analyze_treatment_subset(
258
+ treatment_file,
259
+ emergency_keywords,
260
+ treatment_keywords,
261
+ output_directory
262
+ )
dataset/scripts/keyword_Match_Clean_for_subset_filter.txt ADDED
@@ -0,0 +1,85 @@
1
+ # Keyword Matching and Text Cleaning Logic for Subset Filtering
2
+
3
+ ## 1. Keyword Preprocessing
4
+ ```python
5
+ def preprocess_keywords(keywords_file):
6
+ # Handle special medical term variants
7
+ special_terms = {
8
+ 'x-ray': ['x-ray', 'x ray', 'xray'],
9
+ 'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
10
+ 'point-of-care': ['point-of-care', 'point of care']
11
+ }
12
+
13
+ # Read and preprocess keywords
14
+ with open(keywords_file, "r", encoding="utf-8") as f:
15
+ keywords = [
16
+ line.strip() # Remove whitespace
17
+ .lower() # Convert to lowercase
18
+ for line in f
19
+ if line.strip()
20
+ ]
21
+
22
+ # Process special term variants
23
+ processed_keywords = []
24
+ for kw in keywords:
25
+ if kw in special_terms:
26
+ processed_keywords.extend(special_terms[kw])
27
+ else:
28
+ processed_keywords.append(kw)
29
+
30
+ return processed_keywords
31
+ ```
32
+
33
+ ## 2. Regex Pattern Processing
34
+ ```python
35
+ def create_regex_pattern(keywords):
36
+ # Simple word boundary matching
37
+ pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
38
+ return re.compile(pattern, re.IGNORECASE)
39
+ ```
40
+
41
+ ### Regex Pattern Explanation:
42
+ - `\b`: Word boundary matching
43
+ - `(?:...)`: Non-capturing group
44
+ - `re.escape()`: Escape special characters
45
+ - `re.IGNORECASE`: Case-insensitive matching
46
+
47
+ ## 3. Text Preprocessing and Matching
48
+ ```python
49
+ # Create lowercase version of text
50
+ df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
51
+
52
+ # Match keywords
53
+ df["treatment_matched"] = (
54
+ df["clean_text_lower"]
55
+ .apply(lambda text: "|".join(pattern.findall(text)) or "")
56
+ )
57
+ ```
58
+
59
+ ## 4. Processing Logic Details
60
+
61
+ ### 4.1 Special Term Handling Rationale
62
+ - Common variants in medical literature
63
+ - Maintain semantic consistency
64
+ - Improve matching accuracy
65
+
66
+ ### 4.2 Regex Matching Strategy
67
+ - Word boundary matching for complete terms
68
+ - Precompiled patterns for performance
69
+ - Case-insensitive matching for flexibility
70
+
71
+ ### 4.3 Text Preprocessing Steps
72
+ 1. Fill null values (fillna)
73
+ 2. Convert to lowercase (str.lower)
74
+ 3. Create dedicated lowercase column to avoid repeated conversions
75
+
76
+ ## 5. Output Format
77
+ - matched column: Pipe-separated matched keywords
78
+ - type column: Document type identifier ("emergency" or "treatment")
79
+ - condition column: Reserved for future condition mapping
80
+
81
+ ## 6. Important Considerations
82
+ 1. Regular maintenance required for special term variants
83
+ 2. Precompiled regex patterns for performance optimization
84
+ 3. Dedicated text preprocessing storage to avoid redundant computations
85
+ 4. Maintain consistent column structure between emergency and treatment subsets
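One behavior of the word-boundary pattern worth noting: `\b` matches at any transition between word and non-word characters, so hyphenated forms such as "Non-emergency" still match, while embedded substrings such as "Subacute" do not; the "partial words" cases in the test script below exercise exactly this. A small standalone check:

```python
import re

pattern = re.compile(r"\b(?:emergency|acute|urgent)\b", re.IGNORECASE)

print(bool(pattern.search("Non-emergency situation")))  # True: '-' is a word boundary
print(bool(pattern.search("Subacute condition")))       # False: 'acute' is embedded in a word
print(bool(pattern.search("Emergency-related")))        # True
```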
dataset/scripts/test_keyword_matching.py ADDED
@@ -0,0 +1,175 @@
1
+ import pandas as pd
2
+ import re
3
+ from pathlib import Path
4
+ import json
5
+
6
+ def test_special_terms_matching():
7
+ """
8
+ Test special medical term matching logic
9
+ """
10
+ # Test cases for different scenarios
11
+ test_cases = {
12
+ "x-ray variants": [
13
+ "Patient needs an x-ray of the chest",
14
+ "Ordered chest xray",
15
+ "X ray shows pneumonia",
16
+ "XRAY negative"
17
+ ],
18
+ "ct-scan variants": [
19
+ "CT scan reveals nodule",
20
+ "CT-scan indicates mass",
21
+ "Requires ctscan urgently",
22
+ "CTSCAN of abdomen"
23
+ ],
24
+ "point-of-care variants": [
25
+ "Point-of-care testing needed",
26
+ "Point of care ultrasound",
27
+ "POC testing results"
28
+ ],
29
+ "mixed cases": [
30
+ "Ordered both x-ray and CT scan",
31
+ "XRAY and CTSCAN negative",
32
+ "Multiple point-of-care tests with x-ray"
33
+ ],
34
+ "negative cases": [
35
+ "No imaging mentioned",
36
+ "Regular examination only",
37
+ "Laboratory tests pending"
38
+ ]
39
+ }
40
+
41
+ # Special terms dictionary (from keyword_Match_Clean_for_subset_filter.txt)
42
+ special_terms = {
43
+ 'x-ray': ['x-ray', 'x ray', 'xray'],
44
+ 'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
45
+ 'point-of-care': ['point-of-care', 'point of care']
46
+ }
47
+
48
+ # Create test DataFrame
49
+ test_df = pd.DataFrame({
50
+ 'clean_text': [text for cases in test_cases.values() for text in cases],
51
+ 'category': [cat for cat, texts in test_cases.items() for _ in texts]
52
+ })
53
+
54
+ # Process keywords
55
+ processed_keywords = []
56
+ for term, variants in special_terms.items():
57
+ processed_keywords.extend(variants)
58
+
59
+ # Create regex pattern
60
+ pattern = r"\b(?:" + "|".join(map(re.escape, processed_keywords)) + r")\b"
61
+
62
+ # Apply matching logic
63
+ test_df['matched'] = (
64
+ test_df['clean_text']
65
+ .fillna("")
66
+ .str.findall(pattern, flags=re.IGNORECASE)
67
+ .apply(lambda lst: "|".join(lst) if lst else "")
68
+ )
69
+
70
+ return test_df
71
+
72
+ def test_basic_matching():
73
+ """
74
+ Test basic keyword matching functionality
75
+ """
76
+ # Basic test cases
77
+ test_cases = {
78
+ "simple matches": [
79
+ "Emergency treatment required",
80
+ "Acute condition observed",
81
+ "Urgent care needed"
82
+ ],
83
+ "case variations": [
84
+ "EMERGENCY situation",
85
+ "Acute RESPIRATORY failure",
86
+ "URgent surgical intervention"
87
+ ],
88
+ "multiple matches": [
89
+ "Emergency treatment for acute condition",
90
+ "Urgent care in emergency department",
91
+ "Acute respiratory emergency"
92
+ ],
93
+ "partial words": [
94
+ "Non-emergency situation",
95
+ "Subacute condition",
96
+ "Emergency-related"
97
+ ]
98
+ }
99
+
100
+ # Create test DataFrame
101
+ test_df = pd.DataFrame({
102
+ 'clean_text': [text for cases in test_cases.values() for text in cases],
103
+ 'category': [cat for cat, texts in test_cases.items() for _ in texts]
104
+ })
105
+
106
+ # Test keywords
107
+ test_keywords = ['emergency', 'acute', 'urgent']
108
+ pattern = r"\b(?:" + "|".join(map(re.escape, test_keywords)) + r")\b"
109
+
110
+ # Apply matching logic
111
+ test_df['matched'] = (
112
+ test_df['clean_text']
113
+ .fillna("")
114
+ .str.findall(pattern, flags=re.IGNORECASE)
115
+ .apply(lambda lst: "|".join(lst) if lst else "")
116
+ )
117
+
118
+ return test_df
119
+
120
+ def save_test_results(results_dict):
121
+ """
122
+ Save test results to JSON file
123
+ """
124
+ output_dir = Path("../analysis")
125
+ output_dir.mkdir(exist_ok=True)
126
+
127
+ output_file = output_dir / "keyword_matching_test_results.json"
128
+
129
+ # Convert DataFrame results to dictionary
130
+ for key, df in results_dict.items():
131
+ results_dict[key] = df.to_dict(orient='records')
132
+
133
+ with open(output_file, 'w') as f:
134
+ json.dump(results_dict, f, indent=2)
135
+
136
+ print(f"Results saved to: {output_file}")
137
+
138
+ def run_tests():
139
+ """
140
+ Run all tests and output results
141
+ """
142
+ print("🧪 Running keyword matching tests...")
143
+
144
+ # Run tests
145
+ special_terms_results = test_special_terms_matching()
146
+ basic_matching_results = test_basic_matching()
147
+
148
+ # Print results
149
+ print("\n📊 Special Terms Matching Results:")
150
+ for category in special_terms_results['category'].unique():
151
+ print(f"\n{category}:")
152
+ subset = special_terms_results[special_terms_results['category'] == category]
153
+ for _, row in subset.iterrows():
154
+ print(f"Text: {row['clean_text']}")
155
+ print(f"Matched: {row['matched'] or 'No matches'}")
156
+ print("-" * 50)
157
+
158
+ print("\n📊 Basic Matching Results:")
159
+ for category in basic_matching_results['category'].unique():
160
+ print(f"\n{category}:")
161
+ subset = basic_matching_results[basic_matching_results['category'] == category]
162
+ for _, row in subset.iterrows():
163
+ print(f"Text: {row['clean_text']}")
164
+ print(f"Matched: {row['matched'] or 'No matches'}")
165
+ print("-" * 50)
166
+
167
+ # Save results
168
+ results_dict = {
169
+ 'special_terms_matching': special_terms_results,
170
+ 'basic_matching': basic_matching_results
171
+ }
172
+ save_test_results(results_dict)
173
+
174
+ if __name__ == "__main__":
175
+ run_tests()
requirements.txt CHANGED
@@ -10,12 +10,15 @@ Brotli==1.1.0
10
  certifi==2025.7.14
11
  charset-normalizer==3.4.2
12
  click==8.2.1
 
 
13
  datasets==4.0.0
14
  dill==0.3.8
15
  distro==1.9.0
16
  fastapi==0.116.1
17
  ffmpy==0.6.0
18
  filelock==3.18.0
 
19
  frozenlist==1.7.0
20
  fsspec==2025.3.0
21
  gradio==5.38.0
@@ -29,8 +32,10 @@ huggingface-hub==0.33.4
29
  idna==3.10
30
  Jinja2==3.1.6
31
  jiter==0.10.0
 
32
  markdown-it-py==3.0.0
33
  MarkupSafe==3.0.2
 
34
  mdurl==0.1.2
35
  multidict==6.6.3
36
  multiprocess==0.70.16
@@ -46,6 +51,7 @@ pydantic==2.11.7
46
  pydantic_core==2.33.2
47
  pydub==0.25.1
48
  Pygments==2.19.2
 
49
  python-dateutil==2.9.0.post0
50
  python-multipart==0.0.20
51
  pytz==2025.2
@@ -56,6 +62,7 @@ rich==14.0.0
56
  ruff==0.12.4
57
  safehttpx==0.1.6
58
  safetensors==0.5.3
 
59
  semantic-version==2.10.0
60
  shellingham==1.5.4
61
  six==1.17.0
 
10
  certifi==2025.7.14
11
  charset-normalizer==3.4.2
12
  click==8.2.1
13
+ contourpy==1.3.2
14
+ cycler==0.12.1
15
  datasets==4.0.0
16
  dill==0.3.8
17
  distro==1.9.0
18
  fastapi==0.116.1
19
  ffmpy==0.6.0
20
  filelock==3.18.0
21
+ fonttools==4.59.0
22
  frozenlist==1.7.0
23
  fsspec==2025.3.0
24
  gradio==5.38.0
 
32
  idna==3.10
33
  Jinja2==3.1.6
34
  jiter==0.10.0
35
+ kiwisolver==1.4.8
36
  markdown-it-py==3.0.0
37
  MarkupSafe==3.0.2
38
+ matplotlib==3.10.3
39
  mdurl==0.1.2
40
  multidict==6.6.3
41
  multiprocess==0.70.16
 
51
  pydantic_core==2.33.2
52
  pydub==0.25.1
53
  Pygments==2.19.2
54
+ pyparsing==3.2.3
55
  python-dateutil==2.9.0.post0
56
  python-multipart==0.0.20
57
  pytz==2025.2
 
62
  ruff==0.12.4
63
  safehttpx==0.1.6
64
  safetensors==0.5.3
65
+ seaborn==0.13.2
66
  semantic-version==2.10.0
67
  shellingham==1.5.4
68
  six==1.17.0