Spaces:
Sleeping
Sleeping
YanBoChen
committed on
Commit
·
654aa66
1
Parent(s):
04a03be
feat: update treatment analysis with keyword density calculations and enhanced visualization (test previous 2 datasets, especially treatment_subset)
Browse files
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
"total_records": 9367,
|
4 |
"avg_text_length": 27179.22952919825,
|
5 |
"emergency_keywords_count": 47,
|
6 |
-
"treatment_keywords_count":
|
7 |
},
|
8 |
"emergency_keyword_stats": {
|
9 |
"Acute abdomen": 51,
|
@@ -55,122 +55,239 @@
|
|
55 |
"Ventricular fibrillation": 280
|
56 |
},
|
57 |
"treatment_keyword_stats": {
|
58 |
-
"
|
59 |
-
"
|
60 |
-
"
|
61 |
-
"
|
62 |
-
"crystalloids": 45,
|
63 |
-
"vasopressors": 188,
|
64 |
-
"Vasoconstrictor Agents": 2,
|
65 |
-
"Epinephrine": 806,
|
66 |
-
"Ondansetron": 43,
|
67 |
-
"Ibuprofen": 269,
|
68 |
-
"Morphine": 289,
|
69 |
-
"Lidocaine": 212,
|
70 |
"Airway Management": 174,
|
71 |
-
"
|
72 |
-
"
|
73 |
-
"
|
74 |
-
"
|
75 |
-
"
|
76 |
-
"
|
77 |
-
"
|
78 |
-
"
|
79 |
-
"bipap": 25,
|
80 |
"Bi-level Positive Airway Pressure": 6,
|
81 |
-
"
|
82 |
-
"
|
|
|
83 |
"Cardiopulmonary Resuscitation": 131,
|
84 |
-
"ACLS": 30,
|
85 |
-
"Advanced Cardiac Life Support": 34,
|
86 |
-
"Defibrillation": 96,
|
87 |
"Cardioversion": 142,
|
88 |
-
"Blood Transfusion": 379,
|
89 |
-
"transfusion": 826,
|
90 |
-
"hemodynamic monitoring": 43,
|
91 |
-
"Hemodynamics": 135,
|
92 |
-
"central line placement": 6,
|
93 |
-
"Catheterization Central Venous": 0,
|
94 |
-
"arterial line placement": 0,
|
95 |
"Catheterization Arterial": 0,
|
96 |
-
"
|
97 |
-
"
|
98 |
-
"Wound Management": 37,
|
99 |
-
"Suturing": 53,
|
100 |
-
"Tourniquet": 56,
|
101 |
"compression dressing": 2,
|
102 |
-
"
|
103 |
-
"
|
104 |
-
"
|
105 |
-
"
|
106 |
-
"Radiography": 218,
|
107 |
-
"point-of-care ultrasound": 13,
|
108 |
-
"POCUS": 10,
|
109 |
-
"Ultrasonography Point-of-Care": 0,
|
110 |
-
"x-ray": 1293,
|
111 |
"ct scan": 1036,
|
112 |
-
"
|
113 |
-
"
|
114 |
-
"
|
115 |
-
"
|
116 |
-
"
|
117 |
-
"
|
118 |
-
"
|
119 |
-
"
|
120 |
-
"
|
121 |
-
"
|
122 |
-
"
|
123 |
-
"
|
124 |
-
"
|
125 |
-
"Supportive Care": 564,
|
126 |
-
"monitoring": 3593,
|
127 |
-
"Patient Monitoring": 107,
|
128 |
-
"vital signs monitoring": 1,
|
129 |
-
"Vital Signs": 459,
|
130 |
"icu transfer": 9,
|
131 |
-
"
|
132 |
-
"
|
133 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
"manage": 4416,
|
135 |
-
"Patient Management": 281,
|
136 |
"management": 4008,
|
137 |
-
"intervention": 2695,
|
138 |
-
"Therapeutic Intervention": 181,
|
139 |
-
"Therapy": 6117,
|
140 |
"medication": 4698,
|
141 |
-
"
|
142 |
-
"procedure": 3073,
|
143 |
-
"Surgical Procedures Operative": 0,
|
144 |
-
"resuscitation": 539,
|
145 |
-
"administer": 3881,
|
146 |
-
"Drug Administration Routes": 0,
|
147 |
-
"dose": 5344,
|
148 |
-
"Dosage Forms": 210,
|
149 |
"monitor": 4521,
|
150 |
-
"
|
151 |
-
"
|
152 |
-
"
|
153 |
-
"Surgical Procedures": 482,
|
154 |
-
"antibiotic": 1922,
|
155 |
-
"Anti-Bacterial Agents": 1,
|
156 |
-
"Dopamine": 389,
|
157 |
-
"Amiodarone": 315,
|
158 |
-
"levophed": 11,
|
159 |
-
"Norepinephrine": 392,
|
160 |
-
"Bosmin": 0,
|
161 |
-
"Adrenaline": 135,
|
162 |
-
"Insulin": 808,
|
163 |
"nitroglycerin": 125,
|
164 |
"NTG": 81,
|
165 |
-
"
|
166 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
},
|
168 |
-
"cooccurrence_analysis": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
"path_b_validation": {
|
170 |
-
"avg_emergency_density": 0.
|
171 |
-
"avg_treatment_density":
|
172 |
-
"high_density_records":
|
173 |
-
"precision_estimate": 0.
|
174 |
},
|
175 |
"condition_mapping_candidates": {}
|
176 |
}
|
|
|
3 |
"total_records": 9367,
|
4 |
"avg_text_length": 27179.22952919825,
|
5 |
"emergency_keywords_count": 47,
|
6 |
+
"treatment_keywords_count": 105
|
7 |
},
|
8 |
"emergency_keyword_stats": {
|
9 |
"Acute abdomen": 51,
|
|
|
55 |
"Ventricular fibrillation": 280
|
56 |
},
|
57 |
"treatment_keyword_stats": {
|
58 |
+
"ACLS": 30,
|
59 |
+
"administer": 3881,
|
60 |
+
"Adrenaline": 135,
|
61 |
+
"Advanced Cardiac Life Support": 34,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
"Airway Management": 174,
|
63 |
+
"alpha blocker": 35,
|
64 |
+
"Amiodarone": 315,
|
65 |
+
"analgesia": 323,
|
66 |
+
"Anesthesia Procedural": 0,
|
67 |
+
"Anti-Bacterial Agents": 1,
|
68 |
+
"antibiotic": 1922,
|
69 |
+
"arterial line placement": 0,
|
70 |
+
"beta blocker": 297,
|
|
|
71 |
"Bi-level Positive Airway Pressure": 6,
|
72 |
+
"bipap": 25,
|
73 |
+
"Blood Transfusion": 379,
|
74 |
+
"Bosmin": 0,
|
75 |
"Cardiopulmonary Resuscitation": 131,
|
|
|
|
|
|
|
76 |
"Cardioversion": 142,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
"Catheterization Arterial": 0,
|
78 |
+
"Catheterization Central Venous": 0,
|
79 |
+
"central line placement": 6,
|
|
|
|
|
|
|
80 |
"compression dressing": 2,
|
81 |
+
"Computed Tomography": 518,
|
82 |
+
"cpap": 84,
|
83 |
+
"cpr": 151,
|
84 |
+
"crystalloids": 45,
|
|
|
|
|
|
|
|
|
|
|
85 |
"ct scan": 1036,
|
86 |
+
"Defibrillation": 96,
|
87 |
+
"Dopamine": 389,
|
88 |
+
"Dosage Forms": 210,
|
89 |
+
"dose": 5344,
|
90 |
+
"Drug Administration Routes": 0,
|
91 |
+
"Drug Therapy": 773,
|
92 |
+
"Epinephrine": 806,
|
93 |
+
"fluid": 2938,
|
94 |
+
"fluid resuscitation": 115,
|
95 |
+
"hemodynamic monitoring": 43,
|
96 |
+
"Hemodynamics": 135,
|
97 |
+
"Hemostasis": 180,
|
98 |
+
"Ibuprofen": 269,
|
|
|
|
|
|
|
|
|
|
|
99 |
"icu transfer": 9,
|
100 |
+
"Insulin": 808,
|
101 |
+
"intervention": 2695,
|
102 |
+
"intubation": 493,
|
103 |
+
"Intratracheal Intubation": 3,
|
104 |
+
"Intravenous Infusion": 576,
|
105 |
+
"iv fluids": 75,
|
106 |
+
"laboratory techniques": 29,
|
107 |
+
"laboratory testing": 296,
|
108 |
+
"levophed": 11,
|
109 |
+
"Lidocaine": 212,
|
110 |
"manage": 4416,
|
|
|
111 |
"management": 4008,
|
|
|
|
|
|
|
112 |
"medication": 4698,
|
113 |
+
"midazolam": 204,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
"monitor": 4521,
|
115 |
+
"monitoring": 3593,
|
116 |
+
"Morphine": 289,
|
117 |
+
"Nebulization": 41,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
"nitroglycerin": 125,
|
119 |
"NTG": 81,
|
120 |
+
"Norepinephrine": 392,
|
121 |
+
"normal saline": 252,
|
122 |
+
"Ondansetron": 43,
|
123 |
+
"Oxygen": 1779,
|
124 |
+
"Oxygen Inhalation Therapy": 2,
|
125 |
+
"oxygen therapy": 178,
|
126 |
+
"Patient Management": 281,
|
127 |
+
"Patient Monitoring": 107,
|
128 |
+
"POCUS": 10,
|
129 |
+
"point of care ultrasound": 2,
|
130 |
+
"procedural sedation": 26,
|
131 |
+
"procedure": 3073,
|
132 |
+
"radiologic imaging": 5,
|
133 |
+
"Radiography": 218,
|
134 |
+
"resuscitation": 539,
|
135 |
+
"Sedation": 602,
|
136 |
+
"splinting": 26,
|
137 |
+
"Splints": 29,
|
138 |
+
"supportive care": 564,
|
139 |
+
"surgical procedures": 482,
|
140 |
+
"Surgical Procedures Operative": 0,
|
141 |
+
"surgery": 3531,
|
142 |
+
"Suture": 179,
|
143 |
+
"Suturing": 53,
|
144 |
+
"Therapeutic Intervention": 181,
|
145 |
+
"Therapeutics": 182,
|
146 |
+
"Therapy": 6117,
|
147 |
+
"tourniquet": 56,
|
148 |
+
"transfusion": 826,
|
149 |
+
"treat": 8270,
|
150 |
+
"treatment": 7719,
|
151 |
+
"Ultrasonography Point of Care": 0,
|
152 |
+
"ultrasound": 1273,
|
153 |
+
"Vasoconstrictor Agents": 2,
|
154 |
+
"vasopressors": 188,
|
155 |
+
"ventilation support": 14,
|
156 |
+
"Ventilators": 86,
|
157 |
+
"Vital Signs": 459,
|
158 |
+
"vital signs monitoring": 1,
|
159 |
+
"wound care": 73,
|
160 |
+
"Wound Dressing": 30,
|
161 |
+
"Wound Management": 37,
|
162 |
+
"X-Ray": 1293
|
163 |
},
|
164 |
+
"cooccurrence_analysis": [
|
165 |
+
{
|
166 |
+
"emergency_keyword": "Fever",
|
167 |
+
"treatment_keyword": "treatment",
|
168 |
+
"cooccurrence_count": 3488,
|
169 |
+
"percentage": 37.23710899967973
|
170 |
+
},
|
171 |
+
{
|
172 |
+
"emergency_keyword": "Fever",
|
173 |
+
"treatment_keyword": "Therapy",
|
174 |
+
"cooccurrence_count": 2698,
|
175 |
+
"percentage": 28.803245436105477
|
176 |
+
},
|
177 |
+
{
|
178 |
+
"emergency_keyword": "Fever",
|
179 |
+
"treatment_keyword": "dose",
|
180 |
+
"cooccurrence_count": 2430,
|
181 |
+
"percentage": 25.94213729048788
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"emergency_keyword": "Fever",
|
185 |
+
"treatment_keyword": "medication",
|
186 |
+
"cooccurrence_count": 1979,
|
187 |
+
"percentage": 21.127362015586634
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"emergency_keyword": "Hypotension",
|
191 |
+
"treatment_keyword": "treatment",
|
192 |
+
"cooccurrence_count": 1760,
|
193 |
+
"percentage": 18.789366926443897
|
194 |
+
},
|
195 |
+
{
|
196 |
+
"emergency_keyword": "Fever",
|
197 |
+
"treatment_keyword": "management",
|
198 |
+
"cooccurrence_count": 1753,
|
199 |
+
"percentage": 18.714636489804633
|
200 |
+
},
|
201 |
+
{
|
202 |
+
"emergency_keyword": "Fever",
|
203 |
+
"treatment_keyword": "treat",
|
204 |
+
"cooccurrence_count": 1744,
|
205 |
+
"percentage": 18.618554499839863
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"emergency_keyword": "Fever",
|
209 |
+
"treatment_keyword": "monitoring",
|
210 |
+
"cooccurrence_count": 1674,
|
211 |
+
"percentage": 17.87125013344721
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"emergency_keyword": "Hypotension",
|
215 |
+
"treatment_keyword": "Therapy",
|
216 |
+
"cooccurrence_count": 1558,
|
217 |
+
"percentage": 16.63286004056795
|
218 |
+
},
|
219 |
+
{
|
220 |
+
"emergency_keyword": "Fever",
|
221 |
+
"treatment_keyword": "surgery",
|
222 |
+
"cooccurrence_count": 1505,
|
223 |
+
"percentage": 16.06704387744208
|
224 |
+
},
|
225 |
+
{
|
226 |
+
"emergency_keyword": "Tachycardia",
|
227 |
+
"treatment_keyword": "treatment",
|
228 |
+
"cooccurrence_count": 1441,
|
229 |
+
"percentage": 15.383794171025942
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"emergency_keyword": "Hypotension",
|
233 |
+
"treatment_keyword": "dose",
|
234 |
+
"cooccurrence_count": 1423,
|
235 |
+
"percentage": 15.191630191096403
|
236 |
+
},
|
237 |
+
{
|
238 |
+
"emergency_keyword": "Myocardial Infarction",
|
239 |
+
"treatment_keyword": "treatment",
|
240 |
+
"cooccurrence_count": 1369,
|
241 |
+
"percentage": 14.615138251307783
|
242 |
+
},
|
243 |
+
{
|
244 |
+
"emergency_keyword": "Shock",
|
245 |
+
"treatment_keyword": "treatment",
|
246 |
+
"cooccurrence_count": 1340,
|
247 |
+
"percentage": 14.305540728087967
|
248 |
+
},
|
249 |
+
{
|
250 |
+
"emergency_keyword": "Fever",
|
251 |
+
"treatment_keyword": "fluid",
|
252 |
+
"cooccurrence_count": 1330,
|
253 |
+
"percentage": 14.198782961460447
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"emergency_keyword": "Hemorrhage",
|
257 |
+
"treatment_keyword": "treatment",
|
258 |
+
"cooccurrence_count": 1328,
|
259 |
+
"percentage": 14.177431408134941
|
260 |
+
},
|
261 |
+
{
|
262 |
+
"emergency_keyword": "Hypotension",
|
263 |
+
"treatment_keyword": "monitoring",
|
264 |
+
"cooccurrence_count": 1325,
|
265 |
+
"percentage": 14.145404078146683
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"emergency_keyword": "Tachycardia",
|
269 |
+
"treatment_keyword": "Therapy",
|
270 |
+
"cooccurrence_count": 1277,
|
271 |
+
"percentage": 13.632966798334579
|
272 |
+
},
|
273 |
+
{
|
274 |
+
"emergency_keyword": "Dyspnea",
|
275 |
+
"treatment_keyword": "treatment",
|
276 |
+
"cooccurrence_count": 1228,
|
277 |
+
"percentage": 13.10985374185972
|
278 |
+
},
|
279 |
+
{
|
280 |
+
"emergency_keyword": "Myocardial Infarction",
|
281 |
+
"treatment_keyword": "Therapy",
|
282 |
+
"cooccurrence_count": 1215,
|
283 |
+
"percentage": 12.97106864524394
|
284 |
+
}
|
285 |
+
],
|
286 |
"path_b_validation": {
|
287 |
+
"avg_emergency_density": 0.3098621434407273,
|
288 |
+
"avg_treatment_density": 0.6108515041451529,
|
289 |
+
"high_density_records": 1298,
|
290 |
+
"precision_estimate": 0.9995729689334899
|
291 |
},
|
292 |
"condition_mapping_candidates": {}
|
293 |
}
|
dataset/scripts/data_explorer_treatment.py
CHANGED
@@ -9,6 +9,19 @@ import numpy as np
|
|
9 |
from tqdm import tqdm
|
10 |
import re
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
def analyze_treatment_subset(
|
13 |
treatment_file_path,
|
14 |
emergency_keywords_path,
|
@@ -98,7 +111,8 @@ def analyze_treatment_subset(
|
|
98 |
# Process all emergency keywords
|
99 |
print("\n Processing all emergency keywords...")
|
100 |
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
101 |
-
|
|
|
102 |
emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
103 |
matches = emergency_matrix[:, i].sum()
|
104 |
print(f" - {keyword}: {matches} matches")
|
@@ -106,7 +120,8 @@ def analyze_treatment_subset(
|
|
106 |
# Process all treatment keywords
|
107 |
print("\n Processing all treatment keywords...")
|
108 |
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
109 |
-
|
|
|
110 |
treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
111 |
matches = treatment_matrix[:, i].sum()
|
112 |
print(f" - {keyword}: {matches} matches")
|
@@ -145,168 +160,89 @@ def analyze_treatment_subset(
|
|
145 |
# Compute keyword density with progress bar
|
146 |
print(" Computing keyword density...")
|
147 |
with tqdm(total=2, desc="Density calculation") as pbar:
|
148 |
-
|
|
|
|
|
|
|
|
|
149 |
pbar.update(1)
|
150 |
-
|
|
|
|
|
|
|
|
|
151 |
pbar.update(1)
|
152 |
|
153 |
-
# Store density in dataframe
|
154 |
df['emergency_keyword_density'] = emergency_density
|
155 |
df['treatment_keyword_density'] = treatment_density
|
156 |
|
157 |
-
# Calculate statistics
|
158 |
stats['path_b_validation'] = {
|
159 |
'avg_emergency_density': float(np.mean(emergency_density)),
|
160 |
'avg_treatment_density': float(np.mean(treatment_density)),
|
161 |
-
'high_density_records': int(sum(
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
163 |
}
|
164 |
|
165 |
# Print detailed results
|
166 |
print("\n Results:")
|
167 |
-
print(f" - Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
|
168 |
-
print(f" - Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
|
169 |
-
print(f" - High-density records (
|
170 |
print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
|
171 |
|
172 |
# Sample distribution analysis
|
173 |
print("\n Density Distribution:")
|
174 |
density_counts = pd.DataFrame({
|
175 |
-
'emergency': emergency_density,
|
176 |
-
'treatment': treatment_density
|
177 |
}).value_counts().head()
|
178 |
print(" Top 5 density combinations (emergency, treatment):")
|
179 |
for (em, tr), count in density_counts.items():
|
180 |
-
print(f" - {count} documents have {em} emergency and {tr} treatment
|
181 |
-
|
182 |
-
# Condition mapping candidates
|
183 |
-
print("\n8️⃣ Preparing condition mapping candidates...")
|
184 |
-
|
185 |
-
# Group emergency keywords by potential conditions
|
186 |
-
condition_candidates = {}
|
187 |
-
for pair in cooccurrence_pairs[:10]: # Top 10 pairs
|
188 |
-
em_kw = pair['emergency_keyword']
|
189 |
-
tr_kw = pair['treatment_keyword']
|
190 |
-
|
191 |
-
# Simple condition inference (can be enhanced later)
|
192 |
-
if any(cardiac_term in em_kw.lower() for cardiac_term in ['mi', 'cardiac', 'heart', 'chest']):
|
193 |
-
condition = 'cardiac'
|
194 |
-
elif any(resp_term in em_kw.lower() for resp_term in ['respiratory', 'breathing', 'lung', 'dyspnea']):
|
195 |
-
condition = 'respiratory'
|
196 |
-
elif any(neuro_term in em_kw.lower() for neuro_term in ['stroke', 'seizure', 'consciousness']):
|
197 |
-
condition = 'neurological'
|
198 |
-
else:
|
199 |
-
condition = 'general'
|
200 |
-
|
201 |
-
if condition not in condition_candidates:
|
202 |
-
condition_candidates[condition] = []
|
203 |
-
|
204 |
-
condition_candidates[condition].append({
|
205 |
-
'emergency_keyword': em_kw,
|
206 |
-
'treatment_keyword': tr_kw,
|
207 |
-
'strength': pair['cooccurrence_count']
|
208 |
-
})
|
209 |
-
|
210 |
-
stats['condition_mapping_candidates'] = condition_candidates
|
211 |
|
212 |
# Visualization
|
213 |
-
print("\
|
214 |
output_plots = output_dir / "plots"
|
215 |
output_plots.mkdir(parents=True, exist_ok=True)
|
216 |
|
217 |
-
# 1.
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
# Treatment keywords
|
232 |
-
tr_counts = list(stats['treatment_keyword_stats'].values())
|
233 |
-
tr_labels = list(stats['treatment_keyword_stats'].keys())
|
234 |
-
ax2.bar(range(len(tr_labels)), tr_counts)
|
235 |
-
ax2.set_title('Treatment Keywords Distribution')
|
236 |
-
ax2.set_xlabel('Treatment Keywords')
|
237 |
-
ax2.set_ylabel('Document Count')
|
238 |
-
ax2.tick_params(axis='x', rotation=45, labelsize=8)
|
239 |
-
ax2.set_xticks(range(len(tr_labels)))
|
240 |
-
ax2.set_xticklabels(tr_labels, ha='right')
|
241 |
-
|
242 |
-
plt.tight_layout()
|
243 |
-
plt.savefig(output_plots / "dual_keyword_distribution.png", bbox_inches='tight', dpi=300)
|
244 |
-
plt.close()
|
245 |
-
|
246 |
-
# 2. Co-occurrence heatmap (top pairs)
|
247 |
-
if len(cooccurrence_pairs) > 0:
|
248 |
-
top_pairs = cooccurrence_pairs[:15] # Top 15 for readability
|
249 |
-
cooc_matrix = np.zeros((len(set([p['emergency_keyword'] for p in top_pairs])),
|
250 |
-
len(set([p['treatment_keyword'] for p in top_pairs]))))
|
251 |
-
|
252 |
-
em_unique = list(set([p['emergency_keyword'] for p in top_pairs]))
|
253 |
-
tr_unique = list(set([p['treatment_keyword'] for p in top_pairs]))
|
254 |
-
|
255 |
-
for pair in top_pairs:
|
256 |
-
i = em_unique.index(pair['emergency_keyword'])
|
257 |
-
j = tr_unique.index(pair['treatment_keyword'])
|
258 |
-
cooc_matrix[i, j] = pair['cooccurrence_count']
|
259 |
-
|
260 |
-
plt.figure(figsize=(12, 8))
|
261 |
-
sns.heatmap(cooc_matrix,
|
262 |
-
xticklabels=tr_unique,
|
263 |
-
yticklabels=em_unique,
|
264 |
-
annot=True,
|
265 |
-
fmt='g',
|
266 |
-
cmap='YlOrRd')
|
267 |
-
plt.title('Emergency-Treatment Keywords Co-occurrence Heatmap')
|
268 |
-
plt.xlabel('Treatment Keywords')
|
269 |
-
plt.ylabel('Emergency Keywords')
|
270 |
-
plt.xticks(rotation=45, ha='right')
|
271 |
-
plt.yticks(rotation=0)
|
272 |
-
plt.tight_layout()
|
273 |
-
plt.savefig(output_plots / "cooccurrence_heatmap.png", bbox_inches='tight', dpi=300)
|
274 |
-
plt.close()
|
275 |
|
276 |
-
#
|
277 |
-
plt.
|
278 |
-
|
279 |
-
plt.title('Text Length Distribution in Treatment Subset')
|
280 |
-
plt.xlabel('Text Length (characters)')
|
281 |
-
plt.ylabel('Frequency')
|
282 |
-
plt.axvline(avg_length, color='red', linestyle='--', label=f'Average: {avg_length:.0f}')
|
283 |
plt.legend()
|
284 |
-
plt.savefig(output_plots / "text_length_distribution.png", bbox_inches='tight')
|
285 |
-
plt.close()
|
286 |
|
287 |
-
|
288 |
-
plt.figure(figsize=(10, 8))
|
289 |
-
plt.scatter(df['emergency_keyword_density'], df['treatment_keyword_density'], alpha=0.6)
|
290 |
-
plt.xlabel('Emergency Keyword Density')
|
291 |
-
plt.ylabel('Treatment Keyword Density')
|
292 |
-
plt.title('Emergency vs Treatment Keyword Density')
|
293 |
-
plt.grid(True, alpha=0.3)
|
294 |
-
plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight')
|
295 |
plt.close()
|
296 |
|
297 |
# Save comprehensive statistics
|
298 |
-
print("\
|
299 |
stats_dir = output_dir / "stats"
|
300 |
stats_dir.mkdir(parents=True, exist_ok=True)
|
301 |
|
302 |
with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
|
303 |
json.dump(stats, f, indent=2, ensure_ascii=False)
|
304 |
|
305 |
-
# Save co-occurrence pairs as CSV for easy review
|
306 |
-
if cooccurrence_pairs:
|
307 |
-
cooc_df = pd.DataFrame(cooccurrence_pairs)
|
308 |
-
cooc_df.to_csv(stats_dir / "cooccurrence_pairs.csv", index=False)
|
309 |
-
|
310 |
print(f"✅ Treatment subset analysis complete!")
|
311 |
print(f" Results saved to: {output_dir}")
|
312 |
print(f" Plots: {output_plots}")
|
|
|
9 |
from tqdm import tqdm
|
10 |
import re
|
11 |
|
12 |
+
def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1000 units of text length.

    Only arithmetic and comparison operators are used, so the function
    works on plain numbers and element-wise on NumPy arrays / pandas Series.

    Args:
        matches: Number of keyword matches (scalar or array-like).
        text_length: Total text length (scalar or array-like).
            NOTE(review): the result is only "per 1000 words" if this
            length is a word count; confirm the unit supplied by callers.

    Returns:
        float: ``(matches / text_length) * 1000``, or 0 wherever
        ``text_length`` is 0.
    """
    # An empty text would otherwise raise ZeroDivisionError (scalars) or
    # produce inf/NaN (arrays), so flag the entries that actually have text.
    has_text = text_length != 0
    # Divide by 1 instead of 0 for empty texts; those entries are zeroed
    # out below, so the substitute divisor never leaks into the result.
    safe_length = text_length + (text_length == 0)
    return (matches / safe_length) * 1000 * has_text
|
24 |
+
|
25 |
def analyze_treatment_subset(
|
26 |
treatment_file_path,
|
27 |
emergency_keywords_path,
|
|
|
111 |
# Process all emergency keywords
|
112 |
print("\n Processing all emergency keywords...")
|
113 |
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
114 |
+
# Using word boundary instead of negative lookbehind/lookahead
|
115 |
+
pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
|
116 |
emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
117 |
matches = emergency_matrix[:, i].sum()
|
118 |
print(f" - {keyword}: {matches} matches")
|
|
|
120 |
# Process all treatment keywords
|
121 |
print("\n Processing all treatment keywords...")
|
122 |
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
123 |
+
# Using word boundary instead of negative lookbehind/lookahead
|
124 |
+
pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
|
125 |
treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
126 |
matches = treatment_matrix[:, i].sum()
|
127 |
print(f" - {keyword}: {matches} matches")
|
|
|
160 |
# Compute keyword density with progress bar
|
161 |
print(" Computing keyword density...")
|
162 |
with tqdm(total=2, desc="Density calculation") as pbar:
|
163 |
+
# Calculate density per 1000 words for both emergency and treatment keywords
|
164 |
+
emergency_density = calculate_density(
|
165 |
+
emergency_matrix.sum(axis=1),
|
166 |
+
df['text_length']
|
167 |
+
)
|
168 |
pbar.update(1)
|
169 |
+
|
170 |
+
treatment_density = calculate_density(
|
171 |
+
treatment_matrix.sum(axis=1),
|
172 |
+
df['text_length']
|
173 |
+
)
|
174 |
pbar.update(1)
|
175 |
|
176 |
+
# Store density in dataframe for visualization
|
177 |
df['emergency_keyword_density'] = emergency_density
|
178 |
df['treatment_keyword_density'] = treatment_density
|
179 |
|
180 |
+
# Calculate statistics with the new density metrics
|
181 |
stats['path_b_validation'] = {
|
182 |
'avg_emergency_density': float(np.mean(emergency_density)),
|
183 |
'avg_treatment_density': float(np.mean(treatment_density)),
|
184 |
+
'high_density_records': int(sum(
|
185 |
+
(emergency_density >= np.percentile(emergency_density, 75)) &
|
186 |
+
(treatment_density >= np.percentile(treatment_density, 75))
|
187 |
+
)),
|
188 |
+
'precision_estimate': float(sum(
|
189 |
+
(emergency_density > 0) & (treatment_density > 0)
|
190 |
+
) / len(df))
|
191 |
}
|
192 |
|
193 |
# Print detailed results
|
194 |
print("\n Results:")
|
195 |
+
print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
|
196 |
+
print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
|
197 |
+
print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
|
198 |
print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
|
199 |
|
200 |
# Sample distribution analysis
|
201 |
print("\n Density Distribution:")
|
202 |
density_counts = pd.DataFrame({
|
203 |
+
'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
|
204 |
+
'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
|
205 |
}).value_counts().head()
|
206 |
print(" Top 5 density combinations (emergency, treatment):")
|
207 |
for (em, tr), count in density_counts.items():
|
208 |
+
print(f" - {count} documents have {em} emergency and {tr} treatment density")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
|
210 |
# Visualization
|
211 |
+
print("\n8️⃣ Generating visualizations...")
|
212 |
output_plots = output_dir / "plots"
|
213 |
output_plots.mkdir(parents=True, exist_ok=True)
|
214 |
|
215 |
+
# 1. Keyword density scatter plot with improved visualization
|
216 |
+
plt.figure(figsize=(12, 8))
|
217 |
+
plt.scatter(
|
218 |
+
emergency_density,
|
219 |
+
treatment_density,
|
220 |
+
alpha=0.6,
|
221 |
+
c=np.log1p(df['text_length']), # Color by log text length
|
222 |
+
cmap='viridis'
|
223 |
+
)
|
224 |
+
plt.colorbar(label='Log Text Length')
|
225 |
+
plt.xlabel('Emergency Keyword Density (per 1000 words)')
|
226 |
+
plt.ylabel('Treatment Keyword Density (per 1000 words)')
|
227 |
+
plt.title('Emergency vs Treatment Keyword Density')
|
228 |
+
plt.grid(True, alpha=0.3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
|
230 |
+
# Add mean lines
|
231 |
+
plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
|
232 |
+
plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
|
|
|
|
|
|
|
|
|
233 |
plt.legend()
|
|
|
|
|
234 |
|
235 |
+
plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight', dpi=300)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
plt.close()
|
237 |
|
238 |
# Save comprehensive statistics
|
239 |
+
print("\n9️⃣ Saving analysis results...")
|
240 |
stats_dir = output_dir / "stats"
|
241 |
stats_dir.mkdir(parents=True, exist_ok=True)
|
242 |
|
243 |
with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
|
244 |
json.dump(stats, f, indent=2, ensure_ascii=False)
|
245 |
|
|
|
|
|
|
|
|
|
|
|
246 |
print(f"✅ Treatment subset analysis complete!")
|
247 |
print(f" Results saved to: {output_dir}")
|
248 |
print(f" Plots: {output_plots}")
|