YanBoChen committed on
Commit
654aa66
·
1 Parent(s): 04a03be

feat: update treatment analysis with keyword density calculations and enhanced visualization (tested on the previous 2 datasets, especially treatment_subset)

Browse files
dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json CHANGED
@@ -3,7 +3,7 @@
3
  "total_records": 9367,
4
  "avg_text_length": 27179.22952919825,
5
  "emergency_keywords_count": 47,
6
- "treatment_keywords_count": 118
7
  },
8
  "emergency_keyword_stats": {
9
  "Acute abdomen": 51,
@@ -55,122 +55,239 @@
55
  "Ventricular fibrillation": 280
56
  },
57
  "treatment_keyword_stats": {
58
- "iv fluids": 75,
59
- "Infusion Intravenous": 3,
60
- "fluid resuscitation": 115,
61
- "normal saline": 252,
62
- "crystalloids": 45,
63
- "vasopressors": 188,
64
- "Vasoconstrictor Agents": 2,
65
- "Epinephrine": 806,
66
- "Ondansetron": 43,
67
- "Ibuprofen": 269,
68
- "Morphine": 289,
69
- "Lidocaine": 212,
70
  "Airway Management": 174,
71
- "intubation": 493,
72
- "Intubation Intratracheal": 0,
73
- "ventilation support": 14,
74
- "Ventilators": 86,
75
- "oxygen therapy": 178,
76
- "Oxygen Inhalation Therapy": 2,
77
- "cpap": 84,
78
- "Continuous Positive Airway Pressure": 84,
79
- "bipap": 25,
80
  "Bi-level Positive Airway Pressure": 6,
81
- "Nebulization": 41,
82
- "cpr": 151,
 
83
  "Cardiopulmonary Resuscitation": 131,
84
- "ACLS": 30,
85
- "Advanced Cardiac Life Support": 34,
86
- "Defibrillation": 96,
87
  "Cardioversion": 142,
88
- "Blood Transfusion": 379,
89
- "transfusion": 826,
90
- "hemodynamic monitoring": 43,
91
- "Hemodynamics": 135,
92
- "central line placement": 6,
93
- "Catheterization Central Venous": 0,
94
- "arterial line placement": 0,
95
  "Catheterization Arterial": 0,
96
- "Hemostasis": 180,
97
- "wound care": 73,
98
- "Wound Management": 37,
99
- "Suturing": 53,
100
- "Tourniquet": 56,
101
  "compression dressing": 2,
102
- "Wound Dressing": 30,
103
- "splinting": 26,
104
- "Splints": 29,
105
- "radiologic imaging": 5,
106
- "Radiography": 218,
107
- "point-of-care ultrasound": 13,
108
- "POCUS": 10,
109
- "Ultrasonography Point-of-Care": 0,
110
- "x-ray": 1293,
111
  "ct scan": 1036,
112
- "Tomography X-Ray Computed": 0,
113
- "laboratory testing": 296,
114
- "Laboratory Techniques": 29,
115
- "Sedation": 602,
116
- "analgesia": 323,
117
- "Analgesia": 323,
118
- "procedural sedation": 26,
119
- "Anesthesia Procedural": 0,
120
- "ketamine": 86,
121
- "Ketamine": 86,
122
- "midazolam": 204,
123
- "Midazolam": 204,
124
- "supportive care": 564,
125
- "Supportive Care": 564,
126
- "monitoring": 3593,
127
- "Patient Monitoring": 107,
128
- "vital signs monitoring": 1,
129
- "Vital Signs": 459,
130
  "icu transfer": 9,
131
- "Intensive Care Units": 155,
132
- "treatment": 7719,
133
- "Therapeutics": 182,
 
 
 
 
 
 
 
134
  "manage": 4416,
135
- "Patient Management": 281,
136
  "management": 4008,
137
- "intervention": 2695,
138
- "Therapeutic Intervention": 181,
139
- "Therapy": 6117,
140
  "medication": 4698,
141
- "Drug Therapy": 773,
142
- "procedure": 3073,
143
- "Surgical Procedures Operative": 0,
144
- "resuscitation": 539,
145
- "administer": 3881,
146
- "Drug Administration Routes": 0,
147
- "dose": 5344,
148
- "Dosage Forms": 210,
149
  "monitor": 4521,
150
- "Oxygen": 1779,
151
- "fluid": 2938,
152
- "surgery": 3531,
153
- "Surgical Procedures": 482,
154
- "antibiotic": 1922,
155
- "Anti-Bacterial Agents": 1,
156
- "Dopamine": 389,
157
- "Amiodarone": 315,
158
- "levophed": 11,
159
- "Norepinephrine": 392,
160
- "Bosmin": 0,
161
- "Adrenaline": 135,
162
- "Insulin": 808,
163
  "nitroglycerin": 125,
164
  "NTG": 81,
165
- "beta blocker": 297,
166
- "alpha blocker": 35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  },
168
- "cooccurrence_analysis": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  "path_b_validation": {
170
- "avg_emergency_density": 0.0,
171
- "avg_treatment_density": 4.9375467065229,
172
- "high_density_records": 0,
173
- "precision_estimate": 0.0
174
  },
175
  "condition_mapping_candidates": {}
176
  }
 
3
  "total_records": 9367,
4
  "avg_text_length": 27179.22952919825,
5
  "emergency_keywords_count": 47,
6
+ "treatment_keywords_count": 105
7
  },
8
  "emergency_keyword_stats": {
9
  "Acute abdomen": 51,
 
55
  "Ventricular fibrillation": 280
56
  },
57
  "treatment_keyword_stats": {
58
+ "ACLS": 30,
59
+ "administer": 3881,
60
+ "Adrenaline": 135,
61
+ "Advanced Cardiac Life Support": 34,
 
 
 
 
 
 
 
 
62
  "Airway Management": 174,
63
+ "alpha blocker": 35,
64
+ "Amiodarone": 315,
65
+ "analgesia": 323,
66
+ "Anesthesia Procedural": 0,
67
+ "Anti-Bacterial Agents": 1,
68
+ "antibiotic": 1922,
69
+ "arterial line placement": 0,
70
+ "beta blocker": 297,
 
71
  "Bi-level Positive Airway Pressure": 6,
72
+ "bipap": 25,
73
+ "Blood Transfusion": 379,
74
+ "Bosmin": 0,
75
  "Cardiopulmonary Resuscitation": 131,
 
 
 
76
  "Cardioversion": 142,
 
 
 
 
 
 
 
77
  "Catheterization Arterial": 0,
78
+ "Catheterization Central Venous": 0,
79
+ "central line placement": 6,
 
 
 
80
  "compression dressing": 2,
81
+ "Computed Tomography": 518,
82
+ "cpap": 84,
83
+ "cpr": 151,
84
+ "crystalloids": 45,
 
 
 
 
 
85
  "ct scan": 1036,
86
+ "Defibrillation": 96,
87
+ "Dopamine": 389,
88
+ "Dosage Forms": 210,
89
+ "dose": 5344,
90
+ "Drug Administration Routes": 0,
91
+ "Drug Therapy": 773,
92
+ "Epinephrine": 806,
93
+ "fluid": 2938,
94
+ "fluid resuscitation": 115,
95
+ "hemodynamic monitoring": 43,
96
+ "Hemodynamics": 135,
97
+ "Hemostasis": 180,
98
+ "Ibuprofen": 269,
 
 
 
 
 
99
  "icu transfer": 9,
100
+ "Insulin": 808,
101
+ "intervention": 2695,
102
+ "intubation": 493,
103
+ "Intratracheal Intubation": 3,
104
+ "Intravenous Infusion": 576,
105
+ "iv fluids": 75,
106
+ "laboratory techniques": 29,
107
+ "laboratory testing": 296,
108
+ "levophed": 11,
109
+ "Lidocaine": 212,
110
  "manage": 4416,
 
111
  "management": 4008,
 
 
 
112
  "medication": 4698,
113
+ "midazolam": 204,
 
 
 
 
 
 
 
114
  "monitor": 4521,
115
+ "monitoring": 3593,
116
+ "Morphine": 289,
117
+ "Nebulization": 41,
 
 
 
 
 
 
 
 
 
 
118
  "nitroglycerin": 125,
119
  "NTG": 81,
120
+ "Norepinephrine": 392,
121
+ "normal saline": 252,
122
+ "Ondansetron": 43,
123
+ "Oxygen": 1779,
124
+ "Oxygen Inhalation Therapy": 2,
125
+ "oxygen therapy": 178,
126
+ "Patient Management": 281,
127
+ "Patient Monitoring": 107,
128
+ "POCUS": 10,
129
+ "point of care ultrasound": 2,
130
+ "procedural sedation": 26,
131
+ "procedure": 3073,
132
+ "radiologic imaging": 5,
133
+ "Radiography": 218,
134
+ "resuscitation": 539,
135
+ "Sedation": 602,
136
+ "splinting": 26,
137
+ "Splints": 29,
138
+ "supportive care": 564,
139
+ "surgical procedures": 482,
140
+ "Surgical Procedures Operative": 0,
141
+ "surgery": 3531,
142
+ "Suture": 179,
143
+ "Suturing": 53,
144
+ "Therapeutic Intervention": 181,
145
+ "Therapeutics": 182,
146
+ "Therapy": 6117,
147
+ "tourniquet": 56,
148
+ "transfusion": 826,
149
+ "treat": 8270,
150
+ "treatment": 7719,
151
+ "Ultrasonography Point of Care": 0,
152
+ "ultrasound": 1273,
153
+ "Vasoconstrictor Agents": 2,
154
+ "vasopressors": 188,
155
+ "ventilation support": 14,
156
+ "Ventilators": 86,
157
+ "Vital Signs": 459,
158
+ "vital signs monitoring": 1,
159
+ "wound care": 73,
160
+ "Wound Dressing": 30,
161
+ "Wound Management": 37,
162
+ "X-Ray": 1293
163
  },
164
+ "cooccurrence_analysis": [
165
+ {
166
+ "emergency_keyword": "Fever",
167
+ "treatment_keyword": "treatment",
168
+ "cooccurrence_count": 3488,
169
+ "percentage": 37.23710899967973
170
+ },
171
+ {
172
+ "emergency_keyword": "Fever",
173
+ "treatment_keyword": "Therapy",
174
+ "cooccurrence_count": 2698,
175
+ "percentage": 28.803245436105477
176
+ },
177
+ {
178
+ "emergency_keyword": "Fever",
179
+ "treatment_keyword": "dose",
180
+ "cooccurrence_count": 2430,
181
+ "percentage": 25.94213729048788
182
+ },
183
+ {
184
+ "emergency_keyword": "Fever",
185
+ "treatment_keyword": "medication",
186
+ "cooccurrence_count": 1979,
187
+ "percentage": 21.127362015586634
188
+ },
189
+ {
190
+ "emergency_keyword": "Hypotension",
191
+ "treatment_keyword": "treatment",
192
+ "cooccurrence_count": 1760,
193
+ "percentage": 18.789366926443897
194
+ },
195
+ {
196
+ "emergency_keyword": "Fever",
197
+ "treatment_keyword": "management",
198
+ "cooccurrence_count": 1753,
199
+ "percentage": 18.714636489804633
200
+ },
201
+ {
202
+ "emergency_keyword": "Fever",
203
+ "treatment_keyword": "treat",
204
+ "cooccurrence_count": 1744,
205
+ "percentage": 18.618554499839863
206
+ },
207
+ {
208
+ "emergency_keyword": "Fever",
209
+ "treatment_keyword": "monitoring",
210
+ "cooccurrence_count": 1674,
211
+ "percentage": 17.87125013344721
212
+ },
213
+ {
214
+ "emergency_keyword": "Hypotension",
215
+ "treatment_keyword": "Therapy",
216
+ "cooccurrence_count": 1558,
217
+ "percentage": 16.63286004056795
218
+ },
219
+ {
220
+ "emergency_keyword": "Fever",
221
+ "treatment_keyword": "surgery",
222
+ "cooccurrence_count": 1505,
223
+ "percentage": 16.06704387744208
224
+ },
225
+ {
226
+ "emergency_keyword": "Tachycardia",
227
+ "treatment_keyword": "treatment",
228
+ "cooccurrence_count": 1441,
229
+ "percentage": 15.383794171025942
230
+ },
231
+ {
232
+ "emergency_keyword": "Hypotension",
233
+ "treatment_keyword": "dose",
234
+ "cooccurrence_count": 1423,
235
+ "percentage": 15.191630191096403
236
+ },
237
+ {
238
+ "emergency_keyword": "Myocardial Infarction",
239
+ "treatment_keyword": "treatment",
240
+ "cooccurrence_count": 1369,
241
+ "percentage": 14.615138251307783
242
+ },
243
+ {
244
+ "emergency_keyword": "Shock",
245
+ "treatment_keyword": "treatment",
246
+ "cooccurrence_count": 1340,
247
+ "percentage": 14.305540728087967
248
+ },
249
+ {
250
+ "emergency_keyword": "Fever",
251
+ "treatment_keyword": "fluid",
252
+ "cooccurrence_count": 1330,
253
+ "percentage": 14.198782961460447
254
+ },
255
+ {
256
+ "emergency_keyword": "Hemorrhage",
257
+ "treatment_keyword": "treatment",
258
+ "cooccurrence_count": 1328,
259
+ "percentage": 14.177431408134941
260
+ },
261
+ {
262
+ "emergency_keyword": "Hypotension",
263
+ "treatment_keyword": "monitoring",
264
+ "cooccurrence_count": 1325,
265
+ "percentage": 14.145404078146683
266
+ },
267
+ {
268
+ "emergency_keyword": "Tachycardia",
269
+ "treatment_keyword": "Therapy",
270
+ "cooccurrence_count": 1277,
271
+ "percentage": 13.632966798334579
272
+ },
273
+ {
274
+ "emergency_keyword": "Dyspnea",
275
+ "treatment_keyword": "treatment",
276
+ "cooccurrence_count": 1228,
277
+ "percentage": 13.10985374185972
278
+ },
279
+ {
280
+ "emergency_keyword": "Myocardial Infarction",
281
+ "treatment_keyword": "Therapy",
282
+ "cooccurrence_count": 1215,
283
+ "percentage": 12.97106864524394
284
+ }
285
+ ],
286
  "path_b_validation": {
287
+ "avg_emergency_density": 0.3098621434407273,
288
+ "avg_treatment_density": 0.6108515041451529,
289
+ "high_density_records": 1298,
290
+ "precision_estimate": 0.9995729689334899
291
  },
292
  "condition_mapping_candidates": {}
293
  }
dataset/scripts/data_explorer_treatment.py CHANGED
@@ -9,6 +9,19 @@ import numpy as np
9
  from tqdm import tqdm
10
  import re
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def analyze_treatment_subset(
13
  treatment_file_path,
14
  emergency_keywords_path,
@@ -98,7 +111,8 @@ def analyze_treatment_subset(
98
  # Process all emergency keywords
99
  print("\n Processing all emergency keywords...")
100
  for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
101
- pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
 
102
  emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
103
  matches = emergency_matrix[:, i].sum()
104
  print(f" - {keyword}: {matches} matches")
@@ -106,7 +120,8 @@ def analyze_treatment_subset(
106
  # Process all treatment keywords
107
  print("\n Processing all treatment keywords...")
108
  for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
109
- pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
 
110
  treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
111
  matches = treatment_matrix[:, i].sum()
112
  print(f" - {keyword}: {matches} matches")
@@ -145,168 +160,89 @@ def analyze_treatment_subset(
145
  # Compute keyword density with progress bar
146
  print(" Computing keyword density...")
147
  with tqdm(total=2, desc="Density calculation") as pbar:
148
- emergency_density = emergency_matrix.sum(axis=1)
 
 
 
 
149
  pbar.update(1)
150
- treatment_density = treatment_matrix.sum(axis=1)
 
 
 
 
151
  pbar.update(1)
152
 
153
- # Store density in dataframe
154
  df['emergency_keyword_density'] = emergency_density
155
  df['treatment_keyword_density'] = treatment_density
156
 
157
- # Calculate statistics
158
  stats['path_b_validation'] = {
159
  'avg_emergency_density': float(np.mean(emergency_density)),
160
  'avg_treatment_density': float(np.mean(treatment_density)),
161
- 'high_density_records': int(sum((emergency_density >= 2) & (treatment_density >= 2))),
162
- 'precision_estimate': float(sum((emergency_density >= 1) & (treatment_density >= 1)) / len(df))
 
 
 
 
 
163
  }
164
 
165
  # Print detailed results
166
  print("\n Results:")
167
- print(f" - Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
168
- print(f" - Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
169
- print(f" - High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
170
  print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
171
 
172
  # Sample distribution analysis
173
  print("\n Density Distribution:")
174
  density_counts = pd.DataFrame({
175
- 'emergency': emergency_density,
176
- 'treatment': treatment_density
177
  }).value_counts().head()
178
  print(" Top 5 density combinations (emergency, treatment):")
179
  for (em, tr), count in density_counts.items():
180
- print(f" - {count} documents have {em} emergency and {tr} treatment keywords")
181
-
182
- # Condition mapping candidates
183
- print("\n8️⃣ Preparing condition mapping candidates...")
184
-
185
- # Group emergency keywords by potential conditions
186
- condition_candidates = {}
187
- for pair in cooccurrence_pairs[:10]: # Top 10 pairs
188
- em_kw = pair['emergency_keyword']
189
- tr_kw = pair['treatment_keyword']
190
-
191
- # Simple condition inference (can be enhanced later)
192
- if any(cardiac_term in em_kw.lower() for cardiac_term in ['mi', 'cardiac', 'heart', 'chest']):
193
- condition = 'cardiac'
194
- elif any(resp_term in em_kw.lower() for resp_term in ['respiratory', 'breathing', 'lung', 'dyspnea']):
195
- condition = 'respiratory'
196
- elif any(neuro_term in em_kw.lower() for neuro_term in ['stroke', 'seizure', 'consciousness']):
197
- condition = 'neurological'
198
- else:
199
- condition = 'general'
200
-
201
- if condition not in condition_candidates:
202
- condition_candidates[condition] = []
203
-
204
- condition_candidates[condition].append({
205
- 'emergency_keyword': em_kw,
206
- 'treatment_keyword': tr_kw,
207
- 'strength': pair['cooccurrence_count']
208
- })
209
-
210
- stats['condition_mapping_candidates'] = condition_candidates
211
 
212
  # Visualization
213
- print("\n9️⃣ Generating visualizations...")
214
  output_plots = output_dir / "plots"
215
  output_plots.mkdir(parents=True, exist_ok=True)
216
 
217
- # 1. Dual keyword distribution
218
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
219
-
220
- # Emergency keywords in treatment subset
221
- em_counts = list(stats['emergency_keyword_stats'].values())
222
- em_labels = list(stats['emergency_keyword_stats'].keys())
223
- ax1.bar(range(len(em_labels)), em_counts)
224
- ax1.set_title('Emergency Keywords in Treatment Subset')
225
- ax1.set_xlabel('Emergency Keywords')
226
- ax1.set_ylabel('Document Count')
227
- ax1.tick_params(axis='x', rotation=45, labelsize=8)
228
- ax1.set_xticks(range(len(em_labels)))
229
- ax1.set_xticklabels(em_labels, ha='right')
230
-
231
- # Treatment keywords
232
- tr_counts = list(stats['treatment_keyword_stats'].values())
233
- tr_labels = list(stats['treatment_keyword_stats'].keys())
234
- ax2.bar(range(len(tr_labels)), tr_counts)
235
- ax2.set_title('Treatment Keywords Distribution')
236
- ax2.set_xlabel('Treatment Keywords')
237
- ax2.set_ylabel('Document Count')
238
- ax2.tick_params(axis='x', rotation=45, labelsize=8)
239
- ax2.set_xticks(range(len(tr_labels)))
240
- ax2.set_xticklabels(tr_labels, ha='right')
241
-
242
- plt.tight_layout()
243
- plt.savefig(output_plots / "dual_keyword_distribution.png", bbox_inches='tight', dpi=300)
244
- plt.close()
245
-
246
- # 2. Co-occurrence heatmap (top pairs)
247
- if len(cooccurrence_pairs) > 0:
248
- top_pairs = cooccurrence_pairs[:15] # Top 15 for readability
249
- cooc_matrix = np.zeros((len(set([p['emergency_keyword'] for p in top_pairs])),
250
- len(set([p['treatment_keyword'] for p in top_pairs]))))
251
-
252
- em_unique = list(set([p['emergency_keyword'] for p in top_pairs]))
253
- tr_unique = list(set([p['treatment_keyword'] for p in top_pairs]))
254
-
255
- for pair in top_pairs:
256
- i = em_unique.index(pair['emergency_keyword'])
257
- j = tr_unique.index(pair['treatment_keyword'])
258
- cooc_matrix[i, j] = pair['cooccurrence_count']
259
-
260
- plt.figure(figsize=(12, 8))
261
- sns.heatmap(cooc_matrix,
262
- xticklabels=tr_unique,
263
- yticklabels=em_unique,
264
- annot=True,
265
- fmt='g',
266
- cmap='YlOrRd')
267
- plt.title('Emergency-Treatment Keywords Co-occurrence Heatmap')
268
- plt.xlabel('Treatment Keywords')
269
- plt.ylabel('Emergency Keywords')
270
- plt.xticks(rotation=45, ha='right')
271
- plt.yticks(rotation=0)
272
- plt.tight_layout()
273
- plt.savefig(output_plots / "cooccurrence_heatmap.png", bbox_inches='tight', dpi=300)
274
- plt.close()
275
 
276
- # 3. Text length distribution
277
- plt.figure(figsize=(10, 6))
278
- df['text_length'].hist(bins=50, alpha=0.7)
279
- plt.title('Text Length Distribution in Treatment Subset')
280
- plt.xlabel('Text Length (characters)')
281
- plt.ylabel('Frequency')
282
- plt.axvline(avg_length, color='red', linestyle='--', label=f'Average: {avg_length:.0f}')
283
  plt.legend()
284
- plt.savefig(output_plots / "text_length_distribution.png", bbox_inches='tight')
285
- plt.close()
286
 
287
- # 4. Keyword density scatter plot
288
- plt.figure(figsize=(10, 8))
289
- plt.scatter(df['emergency_keyword_density'], df['treatment_keyword_density'], alpha=0.6)
290
- plt.xlabel('Emergency Keyword Density')
291
- plt.ylabel('Treatment Keyword Density')
292
- plt.title('Emergency vs Treatment Keyword Density')
293
- plt.grid(True, alpha=0.3)
294
- plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight')
295
  plt.close()
296
 
297
  # Save comprehensive statistics
298
- print("\n🔟 Saving analysis results...")
299
  stats_dir = output_dir / "stats"
300
  stats_dir.mkdir(parents=True, exist_ok=True)
301
 
302
  with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
303
  json.dump(stats, f, indent=2, ensure_ascii=False)
304
 
305
- # Save co-occurrence pairs as CSV for easy review
306
- if cooccurrence_pairs:
307
- cooc_df = pd.DataFrame(cooccurrence_pairs)
308
- cooc_df.to_csv(stats_dir / "cooccurrence_pairs.csv", index=False)
309
-
310
  print(f"✅ Treatment subset analysis complete!")
311
  print(f" Results saved to: {output_dir}")
312
  print(f" Plots: {output_plots}")
 
9
  from tqdm import tqdm
10
  import re
11
 
12
def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1000 units of text length.

    The result is ``matches / text_length * 1000``, so its unit is whatever
    unit ``text_length`` is measured in.
    NOTE(review): callers pass ``df['text_length']``, which looks like a
    character count rather than a word count -- confirm before labelling
    results as "per 1000 words".

    Args:
        matches: Number of keyword matches (scalar, NumPy array or Series).
        text_length: Total text length (scalar, NumPy array or Series).

    Returns:
        Density per 1000 length units, with the same container type the
        plain division would produce (float for scalars, array/Series
        otherwise). Entries whose ``text_length`` is 0 get a density of 0.0
        instead of raising ZeroDivisionError or yielding inf/NaN.
    """
    # Scalar fast path: plain Python numbers would raise on division by zero.
    if np.ndim(matches) == 0 and np.ndim(text_length) == 0:
        if text_length == 0:
            return 0.0
        return (matches / text_length) * 1000

    zero_length = np.asarray(text_length) == 0

    # Silence NumPy's divide-by-zero / 0-over-0 warnings; the affected
    # entries are overwritten with 0.0 right below.
    with np.errstate(divide='ignore', invalid='ignore'):
        density = (matches / text_length) * 1000

    if zero_length.any():
        # Empty documents contain no keywords, so their density is 0 rather
        # than inf/NaN, which would otherwise break mean/percentile/qcut.
        density[np.broadcast_to(zero_length, np.shape(density))] = 0.0

    return density
24
+
25
  def analyze_treatment_subset(
26
  treatment_file_path,
27
  emergency_keywords_path,
 
111
  # Process all emergency keywords
112
  print("\n Processing all emergency keywords...")
113
  for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
114
+ # Using word boundary instead of negative lookbehind/lookahead
115
+ pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
116
  emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
117
  matches = emergency_matrix[:, i].sum()
118
  print(f" - {keyword}: {matches} matches")
 
120
  # Process all treatment keywords
121
  print("\n Processing all treatment keywords...")
122
  for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
123
+ # Using word boundary instead of negative lookbehind/lookahead
124
+ pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
125
  treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
126
  matches = treatment_matrix[:, i].sum()
127
  print(f" - {keyword}: {matches} matches")
 
160
  # Compute keyword density with progress bar
161
  print(" Computing keyword density...")
162
  with tqdm(total=2, desc="Density calculation") as pbar:
163
+ # Calculate density per 1000 words for both emergency and treatment keywords
164
+ emergency_density = calculate_density(
165
+ emergency_matrix.sum(axis=1),
166
+ df['text_length']
167
+ )
168
  pbar.update(1)
169
+
170
+ treatment_density = calculate_density(
171
+ treatment_matrix.sum(axis=1),
172
+ df['text_length']
173
+ )
174
  pbar.update(1)
175
 
176
+ # Store density in dataframe for visualization
177
  df['emergency_keyword_density'] = emergency_density
178
  df['treatment_keyword_density'] = treatment_density
179
 
180
+ # Calculate statistics with the new density metrics
181
  stats['path_b_validation'] = {
182
  'avg_emergency_density': float(np.mean(emergency_density)),
183
  'avg_treatment_density': float(np.mean(treatment_density)),
184
+ 'high_density_records': int(sum(
185
+ (emergency_density >= np.percentile(emergency_density, 75)) &
186
+ (treatment_density >= np.percentile(treatment_density, 75))
187
+ )),
188
+ 'precision_estimate': float(sum(
189
+ (emergency_density > 0) & (treatment_density > 0)
190
+ ) / len(df))
191
  }
192
 
193
  # Print detailed results
194
  print("\n Results:")
195
+ print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
196
+ print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
197
+ print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
198
  print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
199
 
200
  # Sample distribution analysis
201
  print("\n Density Distribution:")
202
  density_counts = pd.DataFrame({
203
+ 'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
204
+ 'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
205
  }).value_counts().head()
206
  print(" Top 5 density combinations (emergency, treatment):")
207
  for (em, tr), count in density_counts.items():
208
+ print(f" - {count} documents have {em} emergency and {tr} treatment density")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  # Visualization
211
+ print("\n8️⃣ Generating visualizations...")
212
  output_plots = output_dir / "plots"
213
  output_plots.mkdir(parents=True, exist_ok=True)
214
 
215
+ # 1. Keyword density scatter plot with improved visualization
216
+ plt.figure(figsize=(12, 8))
217
+ plt.scatter(
218
+ emergency_density,
219
+ treatment_density,
220
+ alpha=0.6,
221
+ c=np.log1p(df['text_length']), # Color by log text length
222
+ cmap='viridis'
223
+ )
224
+ plt.colorbar(label='Log Text Length')
225
+ plt.xlabel('Emergency Keyword Density (per 1000 words)')
226
+ plt.ylabel('Treatment Keyword Density (per 1000 words)')
227
+ plt.title('Emergency vs Treatment Keyword Density')
228
+ plt.grid(True, alpha=0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
+ # Add mean lines
231
+ plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
232
+ plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
 
 
 
 
233
  plt.legend()
 
 
234
 
235
+ plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight', dpi=300)
 
 
 
 
 
 
 
236
  plt.close()
237
 
238
  # Save comprehensive statistics
239
+ print("\n9️⃣ Saving analysis results...")
240
  stats_dir = output_dir / "stats"
241
  stats_dir.mkdir(parents=True, exist_ok=True)
242
 
243
  with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
244
  json.dump(stats, f, indent=2, ensure_ascii=False)
245
 
 
 
 
 
 
246
  print(f"✅ Treatment subset analysis complete!")
247
  print(f" Results saved to: {output_dir}")
248
  print(f" Plots: {output_plots}")