Spaces:

ybchen928
/

oncall-guide-ai

Sleeping

App Files Files Community

YanBoChen committed about 1 month ago

Commit

654aa66

1 Parent(s): 04a03be

feat: update treatment analysis with keyword density calculations and enhanced visualization (test previous 2 datasets, especially treatment_subset)

Browse files

Files changed (2) hide show

dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json +218 -101
dataset/scripts/data_explorer_treatment.py +62 -126

dataset/analysis_treatment/stats/treatment_analysis_comprehensive.json CHANGED Viewed

@@ -3,7 +3,7 @@
     "total_records": 9367,
     "avg_text_length": 27179.22952919825,
     "emergency_keywords_count": 47,
-    "treatment_keywords_count": 118
   },
   "emergency_keyword_stats": {
     "Acute abdomen": 51,
@@ -55,122 +55,239 @@
     "Ventricular fibrillation": 280
   },
   "treatment_keyword_stats": {
-    "iv fluids": 75,
-    "Infusion Intravenous": 3,
-    "fluid resuscitation": 115,
-    "normal saline": 252,
-    "crystalloids": 45,
-    "vasopressors": 188,
-    "Vasoconstrictor Agents": 2,
-    "Epinephrine": 806,
-    "Ondansetron": 43,
-    "Ibuprofen": 269,
-    "Morphine": 289,
-    "Lidocaine": 212,
     "Airway Management": 174,
-    "intubation": 493,
-    "Intubation Intratracheal": 0,
-    "ventilation support": 14,
-    "Ventilators": 86,
-    "oxygen therapy": 178,
-    "Oxygen Inhalation Therapy": 2,
-    "cpap": 84,
-    "Continuous Positive Airway Pressure": 84,
-    "bipap": 25,
     "Bi-level Positive Airway Pressure": 6,
-    "Nebulization": 41,
-    "cpr": 151,
     "Cardiopulmonary Resuscitation": 131,
-    "ACLS": 30,
-    "Advanced Cardiac Life Support": 34,
-    "Defibrillation": 96,
     "Cardioversion": 142,
-    "Blood Transfusion": 379,
-    "transfusion": 826,
-    "hemodynamic monitoring": 43,
-    "Hemodynamics": 135,
-    "central line placement": 6,
-    "Catheterization Central Venous": 0,
-    "arterial line placement": 0,
     "Catheterization Arterial": 0,
-    "Hemostasis": 180,
-    "wound care": 73,
-    "Wound Management": 37,
-    "Suturing": 53,
-    "Tourniquet": 56,
     "compression dressing": 2,
-    "Wound Dressing": 30,
-    "splinting": 26,
-    "Splints": 29,
-    "radiologic imaging": 5,
-    "Radiography": 218,
-    "point-of-care ultrasound": 13,
-    "POCUS": 10,
-    "Ultrasonography Point-of-Care": 0,
-    "x-ray": 1293,
     "ct scan": 1036,
-    "Tomography X-Ray Computed": 0,
-    "laboratory testing": 296,
-    "Laboratory Techniques": 29,
-    "Sedation": 602,
-    "analgesia": 323,
-    "Analgesia": 323,
-    "procedural sedation": 26,
-    "Anesthesia Procedural": 0,
-    "ketamine": 86,
-    "Ketamine": 86,
-    "midazolam": 204,
-    "Midazolam": 204,
-    "supportive care": 564,
-    "Supportive Care": 564,
-    "monitoring": 3593,
-    "Patient Monitoring": 107,
-    "vital signs monitoring": 1,
-    "Vital Signs": 459,
     "icu transfer": 9,
-    "Intensive Care Units": 155,
-    "treatment": 7719,
-    "Therapeutics": 182,
     "manage": 4416,
-    "Patient Management": 281,
     "management": 4008,
-    "intervention": 2695,
-    "Therapeutic Intervention": 181,
-    "Therapy": 6117,
     "medication": 4698,
-    "Drug Therapy": 773,
-    "procedure": 3073,
-    "Surgical Procedures Operative": 0,
-    "resuscitation": 539,
-    "administer": 3881,
-    "Drug Administration Routes": 0,
-    "dose": 5344,
-    "Dosage Forms": 210,
     "monitor": 4521,
-    "Oxygen": 1779,
-    "fluid": 2938,
-    "surgery": 3531,
-    "Surgical Procedures": 482,
-    "antibiotic": 1922,
-    "Anti-Bacterial Agents": 1,
-    "Dopamine": 389,
-    "Amiodarone": 315,
-    "levophed": 11,
-    "Norepinephrine": 392,
-    "Bosmin": 0,
-    "Adrenaline": 135,
-    "Insulin": 808,
     "nitroglycerin": 125,
     "NTG": 81,
-    "beta blocker": 297,
-    "alpha blocker": 35
   },
-  "cooccurrence_analysis": [],
   "path_b_validation": {
-    "avg_emergency_density": 0.0,
-    "avg_treatment_density": 4.9375467065229,
-    "high_density_records": 0,
-    "precision_estimate": 0.0
   },
   "condition_mapping_candidates": {}
 }

     "total_records": 9367,
     "avg_text_length": 27179.22952919825,
     "emergency_keywords_count": 47,
+    "treatment_keywords_count": 105
   },
   "emergency_keyword_stats": {
     "Acute abdomen": 51,
     "Ventricular fibrillation": 280
   },
   "treatment_keyword_stats": {
+    "ACLS": 30,
+    "administer": 3881,
+    "Adrenaline": 135,
+    "Advanced Cardiac Life Support": 34,
     "Airway Management": 174,
+    "alpha blocker": 35,
+    "Amiodarone": 315,
+    "analgesia": 323,
+    "Anesthesia Procedural": 0,
+    "Anti-Bacterial Agents": 1,
+    "antibiotic": 1922,
+    "arterial line placement": 0,
+    "beta blocker": 297,
     "Bi-level Positive Airway Pressure": 6,
+    "bipap": 25,
+    "Blood Transfusion": 379,
+    "Bosmin": 0,
     "Cardiopulmonary Resuscitation": 131,
     "Cardioversion": 142,
     "Catheterization Arterial": 0,
+    "Catheterization Central Venous": 0,
+    "central line placement": 6,
     "compression dressing": 2,
+    "Computed Tomography": 518,
+    "cpap": 84,
+    "cpr": 151,
+    "crystalloids": 45,
     "ct scan": 1036,
+    "Defibrillation": 96,
+    "Dopamine": 389,
+    "Dosage Forms": 210,
+    "dose": 5344,
+    "Drug Administration Routes": 0,
+    "Drug Therapy": 773,
+    "Epinephrine": 806,
+    "fluid": 2938,
+    "fluid resuscitation": 115,
+    "hemodynamic monitoring": 43,
+    "Hemodynamics": 135,
+    "Hemostasis": 180,
+    "Ibuprofen": 269,
     "icu transfer": 9,
+    "Insulin": 808,
+    "intervention": 2695,
+    "intubation": 493,
+    "Intratracheal Intubation": 3,
+    "Intravenous Infusion": 576,
+    "iv fluids": 75,
+    "laboratory techniques": 29,
+    "laboratory testing": 296,
+    "levophed": 11,
+    "Lidocaine": 212,
     "manage": 4416,
     "management": 4008,
     "medication": 4698,
+    "midazolam": 204,
     "monitor": 4521,
+    "monitoring": 3593,
+    "Morphine": 289,
+    "Nebulization": 41,
     "nitroglycerin": 125,
     "NTG": 81,
+    "Norepinephrine": 392,
+    "normal saline": 252,
+    "Ondansetron": 43,
+    "Oxygen": 1779,
+    "Oxygen Inhalation Therapy": 2,
+    "oxygen therapy": 178,
+    "Patient Management": 281,
+    "Patient Monitoring": 107,
+    "POCUS": 10,
+    "point of care ultrasound": 2,
+    "procedural sedation": 26,
+    "procedure": 3073,
+    "radiologic imaging": 5,
+    "Radiography": 218,
+    "resuscitation": 539,
+    "Sedation": 602,
+    "splinting": 26,
+    "Splints": 29,
+    "supportive care": 564,
+    "surgical procedures": 482,
+    "Surgical Procedures Operative": 0,
+    "surgery": 3531,
+    "Suture": 179,
+    "Suturing": 53,
+    "Therapeutic Intervention": 181,
+    "Therapeutics": 182,
+    "Therapy": 6117,
+    "tourniquet": 56,
+    "transfusion": 826,
+    "treat": 8270,
+    "treatment": 7719,
+    "Ultrasonography Point of Care": 0,
+    "ultrasound": 1273,
+    "Vasoconstrictor Agents": 2,
+    "vasopressors": 188,
+    "ventilation support": 14,
+    "Ventilators": 86,
+    "Vital Signs": 459,
+    "vital signs monitoring": 1,
+    "wound care": 73,
+    "Wound Dressing": 30,
+    "Wound Management": 37,
+    "X-Ray": 1293
   },
+  "cooccurrence_analysis": [
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 3488,
+      "percentage": 37.23710899967973
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 2698,
+      "percentage": 28.803245436105477
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "dose",
+      "cooccurrence_count": 2430,
+      "percentage": 25.94213729048788
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "medication",
+      "cooccurrence_count": 1979,
+      "percentage": 21.127362015586634
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1760,
+      "percentage": 18.789366926443897
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "management",
+      "cooccurrence_count": 1753,
+      "percentage": 18.714636489804633
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "treat",
+      "cooccurrence_count": 1744,
+      "percentage": 18.618554499839863
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "monitoring",
+      "cooccurrence_count": 1674,
+      "percentage": 17.87125013344721
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1558,
+      "percentage": 16.63286004056795
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "surgery",
+      "cooccurrence_count": 1505,
+      "percentage": 16.06704387744208
+    },
+    {
+      "emergency_keyword": "Tachycardia",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1441,
+      "percentage": 15.383794171025942
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "dose",
+      "cooccurrence_count": 1423,
+      "percentage": 15.191630191096403
+    },
+    {
+      "emergency_keyword": "Myocardial Infarction",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1369,
+      "percentage": 14.615138251307783
+    },
+    {
+      "emergency_keyword": "Shock",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1340,
+      "percentage": 14.305540728087967
+    },
+    {
+      "emergency_keyword": "Fever",
+      "treatment_keyword": "fluid",
+      "cooccurrence_count": 1330,
+      "percentage": 14.198782961460447
+    },
+    {
+      "emergency_keyword": "Hemorrhage",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1328,
+      "percentage": 14.177431408134941
+    },
+    {
+      "emergency_keyword": "Hypotension",
+      "treatment_keyword": "monitoring",
+      "cooccurrence_count": 1325,
+      "percentage": 14.145404078146683
+    },
+    {
+      "emergency_keyword": "Tachycardia",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1277,
+      "percentage": 13.632966798334579
+    },
+    {
+      "emergency_keyword": "Dyspnea",
+      "treatment_keyword": "treatment",
+      "cooccurrence_count": 1228,
+      "percentage": 13.10985374185972
+    },
+    {
+      "emergency_keyword": "Myocardial Infarction",
+      "treatment_keyword": "Therapy",
+      "cooccurrence_count": 1215,
+      "percentage": 12.97106864524394
+    }
+  ],
   "path_b_validation": {
+    "avg_emergency_density": 0.3098621434407273,
+    "avg_treatment_density": 0.6108515041451529,
+    "high_density_records": 1298,
+    "precision_estimate": 0.9995729689334899
   },
   "condition_mapping_candidates": {}
 }

dataset/scripts/data_explorer_treatment.py CHANGED Viewed

@@ -9,6 +9,19 @@ import numpy as np
 from tqdm import tqdm
 import re
 def analyze_treatment_subset(
     treatment_file_path,
     emergency_keywords_path,
@@ -98,7 +111,8 @@ def analyze_treatment_subset(
     # Process all emergency keywords
     print("\n   Processing all emergency keywords...")
     for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
-        pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
         emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
         matches = emergency_matrix[:, i].sum()
         print(f"   - {keyword}: {matches} matches")
@@ -106,7 +120,8 @@ def analyze_treatment_subset(
     # Process all treatment keywords
     print("\n   Processing all treatment keywords...")
     for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
-        pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
         treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
         matches = treatment_matrix[:, i].sum()
         print(f"   - {keyword}: {matches} matches")
@@ -145,168 +160,89 @@ def analyze_treatment_subset(
     # Compute keyword density with progress bar
     print("   Computing keyword density...")
     with tqdm(total=2, desc="Density calculation") as pbar:
-        emergency_density = emergency_matrix.sum(axis=1)
         pbar.update(1)
-        treatment_density = treatment_matrix.sum(axis=1)
         pbar.update(1)
-    # Store density in dataframe
     df['emergency_keyword_density'] = emergency_density
     df['treatment_keyword_density'] = treatment_density
-    # Calculate statistics
     stats['path_b_validation'] = {
         'avg_emergency_density': float(np.mean(emergency_density)),
         'avg_treatment_density': float(np.mean(treatment_density)),
-        'high_density_records': int(sum((emergency_density >= 2) & (treatment_density >= 2))),
-        'precision_estimate': float(sum((emergency_density >= 1) & (treatment_density >= 1)) / len(df))
     }
     # Print detailed results
     print("\n   Results:")
-    print(f"   - Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
-    print(f"   - Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
-    print(f"   - High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
     print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
     # Sample distribution analysis
     print("\n   Density Distribution:")
     density_counts = pd.DataFrame({
-        'emergency': emergency_density,
-        'treatment': treatment_density
     }).value_counts().head()
     print("   Top 5 density combinations (emergency, treatment):")
     for (em, tr), count in density_counts.items():
-        print(f"   - {count} documents have {em} emergency and {tr} treatment keywords")
-    # Condition mapping candidates
-    print("\n8️⃣ Preparing condition mapping candidates...")
-    # Group emergency keywords by potential conditions
-    condition_candidates = {}
-    for pair in cooccurrence_pairs[:10]:  # Top 10 pairs
-        em_kw = pair['emergency_keyword']
-        tr_kw = pair['treatment_keyword']
-        # Simple condition inference (can be enhanced later)
-        if any(cardiac_term in em_kw.lower() for cardiac_term in ['mi', 'cardiac', 'heart', 'chest']):
-            condition = 'cardiac'
-        elif any(resp_term in em_kw.lower() for resp_term in ['respiratory', 'breathing', 'lung', 'dyspnea']):
-            condition = 'respiratory'
-        elif any(neuro_term in em_kw.lower() for neuro_term in ['stroke', 'seizure', 'consciousness']):
-            condition = 'neurological'
-        else:
-            condition = 'general'
-        if condition not in condition_candidates:
-            condition_candidates[condition] = []
-        condition_candidates[condition].append({
-            'emergency_keyword': em_kw,
-            'treatment_keyword': tr_kw,
-            'strength': pair['cooccurrence_count']
-        })
-    stats['condition_mapping_candidates'] = condition_candidates
     # Visualization
-    print("\n9️⃣ Generating visualizations...")
     output_plots = output_dir / "plots"
     output_plots.mkdir(parents=True, exist_ok=True)
-    # 1. Dual keyword distribution
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
-    # Emergency keywords in treatment subset
-    em_counts = list(stats['emergency_keyword_stats'].values())
-    em_labels = list(stats['emergency_keyword_stats'].keys())
-    ax1.bar(range(len(em_labels)), em_counts)
-    ax1.set_title('Emergency Keywords in Treatment Subset')
-    ax1.set_xlabel('Emergency Keywords')
-    ax1.set_ylabel('Document Count')
-    ax1.tick_params(axis='x', rotation=45, labelsize=8)
-    ax1.set_xticks(range(len(em_labels)))
-    ax1.set_xticklabels(em_labels, ha='right')
-    # Treatment keywords
-    tr_counts = list(stats['treatment_keyword_stats'].values())
-    tr_labels = list(stats['treatment_keyword_stats'].keys())
-    ax2.bar(range(len(tr_labels)), tr_counts)
-    ax2.set_title('Treatment Keywords Distribution')
-    ax2.set_xlabel('Treatment Keywords')
-    ax2.set_ylabel('Document Count')
-    ax2.tick_params(axis='x', rotation=45, labelsize=8)
-    ax2.set_xticks(range(len(tr_labels)))
-    ax2.set_xticklabels(tr_labels, ha='right')
-    plt.tight_layout()
-    plt.savefig(output_plots / "dual_keyword_distribution.png", bbox_inches='tight', dpi=300)
-    plt.close()
-    # 2. Co-occurrence heatmap (top pairs)
-    if len(cooccurrence_pairs) > 0:
-        top_pairs = cooccurrence_pairs[:15]  # Top 15 for readability
-        cooc_matrix = np.zeros((len(set([p['emergency_keyword'] for p in top_pairs])),
-                               len(set([p['treatment_keyword'] for p in top_pairs]))))
-        em_unique = list(set([p['emergency_keyword'] for p in top_pairs]))
-        tr_unique = list(set([p['treatment_keyword'] for p in top_pairs]))
-        for pair in top_pairs:
-            i = em_unique.index(pair['emergency_keyword'])
-            j = tr_unique.index(pair['treatment_keyword'])
-            cooc_matrix[i, j] = pair['cooccurrence_count']
-        plt.figure(figsize=(12, 8))
-        sns.heatmap(cooc_matrix,
-                   xticklabels=tr_unique,
-                   yticklabels=em_unique,
-                   annot=True,
-                   fmt='g',
-                   cmap='YlOrRd')
-        plt.title('Emergency-Treatment Keywords Co-occurrence Heatmap')
-        plt.xlabel('Treatment Keywords')
-        plt.ylabel('Emergency Keywords')
-        plt.xticks(rotation=45, ha='right')
-        plt.yticks(rotation=0)
-        plt.tight_layout()
-        plt.savefig(output_plots / "cooccurrence_heatmap.png", bbox_inches='tight', dpi=300)
-        plt.close()
-    # 3. Text length distribution
-    plt.figure(figsize=(10, 6))
-    df['text_length'].hist(bins=50, alpha=0.7)
-    plt.title('Text Length Distribution in Treatment Subset')
-    plt.xlabel('Text Length (characters)')
-    plt.ylabel('Frequency')
-    plt.axvline(avg_length, color='red', linestyle='--', label=f'Average: {avg_length:.0f}')
     plt.legend()
-    plt.savefig(output_plots / "text_length_distribution.png", bbox_inches='tight')
-    plt.close()
-    # 4. Keyword density scatter plot
-    plt.figure(figsize=(10, 8))
-    plt.scatter(df['emergency_keyword_density'], df['treatment_keyword_density'], alpha=0.6)
-    plt.xlabel('Emergency Keyword Density')
-    plt.ylabel('Treatment Keyword Density')
-    plt.title('Emergency vs Treatment Keyword Density')
-    plt.grid(True, alpha=0.3)
-    plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight')
     plt.close()
     # Save comprehensive statistics
-    print("\n🔟 Saving analysis results...")
     stats_dir = output_dir / "stats"
     stats_dir.mkdir(parents=True, exist_ok=True)
     with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
         json.dump(stats, f, indent=2, ensure_ascii=False)
-    # Save co-occurrence pairs as CSV for easy review
-    if cooccurrence_pairs:
-        cooc_df = pd.DataFrame(cooccurrence_pairs)
-        cooc_df.to_csv(stats_dir / "cooccurrence_pairs.csv", index=False)
     print(f"✅ Treatment subset analysis complete!")
     print(f"   Results saved to: {output_dir}")
     print(f"   Plots: {output_plots}")

 from tqdm import tqdm
 import re
def calculate_density(matches, text_length):
    """
    Calculate keyword density per 1000 units of text length
    The result is ``matches / text_length * 1000``, so its unit is whatever
    unit ``text_length`` is measured in (a word count gives a density per
    1000 words, a character count gives one per 1000 characters).
    NOTE(review): callers in this file pass ``df['text_length']``, which
    appears to be a character count rather than a word count - confirm
    before labelling the output "per 1000 words".
    Args:
        matches: Number of keyword matches (scalar, NumPy array or pandas Series)
        text_length: Total text length (scalar, list/tuple, NumPy array or pandas Series)
    Returns:
        float or array-like: Density per 1000 length units; 0.0 wherever
        text_length is 0 (instead of raising or producing inf/NaN)
    """
    # Scalar length: a single guard is enough, and multiplying by 0.0 keeps
    # the shape/type of `matches` (scalar -> 0.0, array/Series -> zeros).
    if np.ndim(text_length) == 0:
        if text_length == 0:
            return matches * 0.0
        return (matches / text_length) * 1000

    # Plain sequences do not support element-wise comparison or division.
    if isinstance(text_length, (list, tuple)):
        text_length = np.asarray(text_length)

    empty = text_length == 0
    # Silence NumPy's divide-by-zero warnings; those slots are overwritten below.
    with np.errstate(divide='ignore', invalid='ignore'):
        density = (matches / text_length) * 1000
    if np.any(empty):
        # An empty document contains no keywords, so its density is defined as 0
        # rather than inf (n/0) or NaN (0/0), which would corrupt means/percentiles.
        density[empty] = 0.0
    return density
 def analyze_treatment_subset(
     treatment_file_path,
     emergency_keywords_path,
     # Process all emergency keywords
     print("\n   Processing all emergency keywords...")
     for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
+        # Using word boundary instead of negative lookbehind/lookahead
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
         emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
         matches = emergency_matrix[:, i].sum()
         print(f"   - {keyword}: {matches} matches")
     # Process all treatment keywords
     print("\n   Processing all treatment keywords...")
     for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
+        # Using word boundary instead of negative lookbehind/lookahead
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
         treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
         matches = treatment_matrix[:, i].sum()
         print(f"   - {keyword}: {matches} matches")
     # Compute keyword density with progress bar
     print("   Computing keyword density...")
     with tqdm(total=2, desc="Density calculation") as pbar:
+        # Calculate density per 1000 words for both emergency and treatment keywords
+        emergency_density = calculate_density(
+            emergency_matrix.sum(axis=1),
+            df['text_length']
+        )
         pbar.update(1)
+        treatment_density = calculate_density(
+            treatment_matrix.sum(axis=1),
+            df['text_length']
+        )
         pbar.update(1)
+    # Store density in dataframe for visualization
     df['emergency_keyword_density'] = emergency_density
     df['treatment_keyword_density'] = treatment_density
+    # Calculate statistics with the new density metrics
     stats['path_b_validation'] = {
         'avg_emergency_density': float(np.mean(emergency_density)),
         'avg_treatment_density': float(np.mean(treatment_density)),
+        'high_density_records': int(sum(
+            (emergency_density >= np.percentile(emergency_density, 75)) &
+            (treatment_density >= np.percentile(treatment_density, 75))
+        )),
+        'precision_estimate': float(sum(
+            (emergency_density > 0) & (treatment_density > 0)
+        ) / len(df))
     }
     # Print detailed results
     print("\n   Results:")
+    print(f"   - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
+    print(f"   - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
+    print(f"   - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
     print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
     # Sample distribution analysis
     print("\n   Density Distribution:")
     density_counts = pd.DataFrame({
+        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
+        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
     }).value_counts().head()
     print("   Top 5 density combinations (emergency, treatment):")
     for (em, tr), count in density_counts.items():
+        print(f"   - {count} documents have {em} emergency and {tr} treatment density")
     # Visualization
+    print("\n8️⃣ Generating visualizations...")
     output_plots = output_dir / "plots"
     output_plots.mkdir(parents=True, exist_ok=True)
+    # 1. Keyword density scatter plot with improved visualization
+    plt.figure(figsize=(12, 8))
+    plt.scatter(
+        emergency_density,
+        treatment_density,
+        alpha=0.6,
+        c=np.log1p(df['text_length']),  # Color by log text length
+        cmap='viridis'
+    )
+    plt.colorbar(label='Log Text Length')
+    plt.xlabel('Emergency Keyword Density (per 1000 words)')
+    plt.ylabel('Treatment Keyword Density (per 1000 words)')
+    plt.title('Emergency vs Treatment Keyword Density')
+    plt.grid(True, alpha=0.3)
+    # Add mean lines
+    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
+    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
     plt.legend()
+    plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight', dpi=300)
     plt.close()
     # Save comprehensive statistics
+    print("\n9️⃣ Saving analysis results...")
     stats_dir = output_dir / "stats"
     stats_dir.mkdir(parents=True, exist_ok=True)
     with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
         json.dump(stats, f, indent=2, ensure_ascii=False)
     print(f"✅ Treatment subset analysis complete!")
     print(f"   Results saved to: {output_dir}")
     print(f"   Plots: {output_plots}")