YanBoChen committed on
Commit 8de0937 · 1 Parent(s): 9829a46

refactor: migrate special terms to JSON configuration


BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files

1. Create New Configuration Files:
   - Add special_terms_emergency.json
     - Organize emergency terms by categories (cardiac, respiratory, etc.)
     - Include all existing mappings with standardized structure
   - Add special_terms_treatment.json
     - Organize treatment terms by categories (imaging, medications, etc.)
     - Maintain all existing term variants

2. Update Processing Scripts:
   - Modify 01_filter_emergency_opt.py:
     - Load terms from JSON configuration
     - Add term standardization
     - Implement deduplication
     - Preserve category information
   - Modify 02_filter_treatment_opt.py:
     - Similar updates for treatment terms
     - Maintain consistent processing logic

3. New Features:
- Term standardization: Convert variants to standard form
- Deduplication: Remove repeated terms while preserving order
- Category-aware: Support for term categorization
- Improved maintainability: Configuration separated from code

4. Technical Details (loading logic condensed in the sketch below):
- Use pathlib for file path handling
- JSON structure supports hierarchical organization
- Maintain backward compatibility
- Add type hints for better code clarity
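The shared loading logic is small enough to show in one place. The sketch below is condensed from the script diffs further down; it assumes the scripts are run from dataset/scripts/ so that the relative ../keywords path resolves.

```python
import json
from pathlib import Path

# Condensed from 01_filter_emergency_opt.py below; working directory assumed
# to be dataset/scripts/ so that ../keywords resolves.
keywords_dir = Path("../keywords")
with open(keywords_dir / "special_terms_emergency.json", "r") as f:
    terms_by_category = json.load(f)   # {"cardiac": {"mi": [...], ...}, ...}

# Category structure is kept for reporting; lookups use a flattened dict.
special_terms = {}
for category in terms_by_category.values():
    special_terms.update(category)      # {"mi": [...], "acs": [...], ...}
```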

Testing (a standalone verification sketch follows):
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic
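A minimal way to exercise those checks is sketched below. The snippet is not part of the commit, and the duplicate-key assertion is an assumed invariant rather than something the scripts enforce.

```python
import json
from pathlib import Path

# Hypothetical verification snippet (run from dataset/scripts/ so ../keywords resolves).
for name in ("special_terms_emergency.json", "special_terms_treatment.json"):
    data = json.loads((Path("../keywords") / name).read_text(encoding="utf-8"))  # fails loudly on bad JSON

    flat = {}
    for category, terms in data.items():
        for standard, variants in terms.items():
            # Assumed invariant: a standard term appears in only one category
            assert standard not in flat, f"{standard} duplicated across categories in {name}"
            assert isinstance(variants, list) and variants, f"{standard} has no variants"
            flat[standard] = variants
    print(f"{name}: {len(data)} categories, {len(flat)} standard terms")
```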

dataset/keywords/special_terms_emergency.json ADDED
@@ -0,0 +1,26 @@
+{
+  "cardiac": {
+    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
+    "acs": ["acs", "ACS", "acute coronary syndrome"]
+  },
+  "respiratory": {
+    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
+    "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
+  },
+  "neurological": {
+    "loc": ["loc", "LOC", "loss of consciousness"],
+    "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
+  },
+  "shock": {
+    "shock": ["shock", "circulatory failure"],
+    "septic_shock": ["septic shock", "sepsis induced shock"]
+  },
+  "bleeding": {
+    "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
+    "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
+  },
+  "vital_signs": {
+    "hypotension": ["hypotension", "low bp", "low blood pressure"],
+    "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
+  }
+}
dataset/keywords/special_terms_treatment.json ADDED
@@ -0,0 +1,25 @@
+{
+  "imaging": {
+    "x-ray": ["x-ray", "x ray", "xray", "XR"],
+    "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
+    "us": ["us", "u/s", "ultrasound", "sonography"]
+  },
+  "medications": {
+    "iv": ["iv", "i.v.", "intravenous"],
+    "im": ["im", "i.m.", "intramuscular"],
+    "po": ["po", "p.o.", "per os", "by mouth"]
+  },
+  "procedures": {
+    "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
+    "intubation": ["intubation", "ETT", "endotracheal tube"],
+    "cardioversion": ["cardioversion", "electrical cardioversion"]
+  },
+  "monitoring": {
+    "ecg": ["ecg", "ekg", "electrocardiogram"],
+    "monitoring": ["monitoring", "continuous observation"]
+  },
+  "ventilation": {
+    "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
+    "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
+  }
+}
dataset/scripts/01_filter_emergency_opt.py CHANGED
@@ -1,36 +1,20 @@
 import os
 import re
+import json
 import pandas as pd
+from pathlib import Path
 
-# Medical term processor class for handling special terms
 class MedicalTermProcessor:
     def __init__(self):
-        # Emergency special terms mapping
-        self.emergency_special_terms = {
-            # Cardiac
-            'mi': ['mi', 'm.i.', 'myocardial infarction', 'MI'],
-            'acs': ['acs', 'ACS', 'acute coronary syndrome'],
-
-            # Respiratory
-            'ards': ['ards', 'ARDS', 'acute respiratory distress syndrome'],
-            'respiratory_failure': ['respiratory failure', 'resp failure', 'RF'],
-
-            # Neurological
-            'loc': ['loc', 'LOC', 'loss of consciousness'],
-            'cva': ['cva', 'CVA', 'stroke', 'cerebrovascular accident'],
-
-            # Shock States
-            'shock': ['shock', 'circulatory failure'],
-            'septic_shock': ['septic shock', 'sepsis induced shock'],
-
-            # Bleeding
-            'gi_bleed': ['gi bleed', 'gi bleeding', 'gastrointestinal hemorrhage', 'GI hemorrhage'],
-            'hemorrhage': ['hemorrhage', 'bleeding', 'blood loss'],
-
-            # Vital Signs
-            'hypotension': ['hypotension', 'low bp', 'low blood pressure'],
-            'tachycardia': ['tachycardia', 'elevated heart rate', 'fast heart rate']
-        }
+        # Load emergency special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_emergency.json", "r") as f:
+            self.emergency_terms_by_category = json.load(f)
+
+        # Flatten the nested structure for easy lookup
+        self.emergency_special_terms = {}
+        for category in self.emergency_terms_by_category.values():
+            self.emergency_special_terms.update(category)
 
     def get_all_variants(self):
         """Get all term variants including special terms"""
@@ -39,6 +23,32 @@ class MedicalTermProcessor:
             variants.extend(term_list)
         return variants
 
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.emergency_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
 # Function: Load keywords and print progress
 def load_keywords(path, processor):
     print(f"📥 Loading keywords from: {path}")
@@ -70,7 +80,7 @@ df["matched"] = (
     df["clean_text"]
     .fillna("")  # Convert NaN to empty string
     .str.findall(pattern, flags=re.IGNORECASE)
-    .apply(lambda lst: "|".join(lst) if lst else "")
+    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
 )
 df["has_emergency"] = df["matched"].str.len() > 0
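To make the effect of the new .apply(...) concrete, here is a small self-contained illustration of what process_matches does to a findall result. The mapping is a hand-copied subset of special_terms_emergency.json, and the helper mirrors (rather than imports) the method added above.

```python
# Illustration only: a two-entry subset of the emergency mapping.
terms = {
    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
}

def standardize(term: str) -> str:
    low = term.lower()
    for standard, variants in terms.items():
        if low in [v.lower() for v in variants]:
            return standard
    return term

matches = ["MI", "myocardial infarction", "ARDS"]   # e.g. output of .str.findall()

# Old lambda: "|".join(matches)  ->  "MI|myocardial infarction|ARDS"
# New process_matches: standardize, then deduplicate while preserving order
seen, unique = set(), []
for t in (standardize(m) for m in matches):
    if t.lower() not in seen:
        unique.append(t)
        seen.add(t.lower())
print("|".join(unique))   # -> "mi|ards"
```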
dataset/scripts/02_filter_treatment_opt.py ADDED
@@ -0,0 +1,131 @@
+import os
+import re
+import json
+import pandas as pd
+from pathlib import Path
+
+class MedicalTermProcessor:
+    def __init__(self):
+        # Load treatment special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_treatment.json", "r") as f:
+            self.treatment_terms_by_category = json.load(f)
+
+        # Flatten the nested structure for easy lookup
+        self.treatment_special_terms = {}
+        for category in self.treatment_terms_by_category.values():
+            self.treatment_special_terms.update(category)
+
+    def get_all_variants(self):
+        """Get all term variants including special terms"""
+        variants = []
+        for term_list in self.treatment_special_terms.values():
+            variants.extend(term_list)
+        return variants
+
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.treatment_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
+def load_keywords(path, processor):
+    """Load and preprocess treatment keywords"""
+    print(f"📥 Loading keywords from: {path}")
+
+    # Load basic keywords
+    with open(path, "r", encoding="utf-8") as f:
+        basic_kws = [line.strip() for line in f if line.strip()]
+
+    # Add special term variants
+    special_kws = processor.get_all_variants()
+    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates
+
+    print(f" Loaded {len(all_kws)} keywords (including variants)")
+    return all_kws
+
+# Step 1: Read optimized emergency subset
+print("1️⃣ Reading optimized emergency subset...")
+emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
+df = pd.read_json(emergency_path, lines=True)
+print(f" Loaded {len(df)} emergency records")
+print(f" Contains emergency keywords in 'matched' column")
+
+# Step 2: Load treatment keywords and match
+print("2️⃣ Loading treatment keywords and matching...")
+processor = MedicalTermProcessor()
+keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
+pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+
+# Step 3: Process text and match keywords
+print("3️⃣ Processing text and matching keywords...")
+# Match treatment keywords and add metadata columns
+df["treatment_matched"] = (
+    df["clean_text"]
+    .fillna("")  # Convert NaN to empty string
+    .str.findall(pattern, flags=re.IGNORECASE)
+    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
+)
+df["has_treatment"] = df["treatment_matched"].str.len() > 0
+
+# Add metadata columns for future use
+df["type"] = "treatment"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
+# Verify columns
+print(" Verifying columns...")
+print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
+print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
+
+# Calculate statistics
+cnt_treat = df["has_treatment"].sum()
+avg_matches = (
+    df[df["has_treatment"]]["treatment_matched"]
+    .str.count(r"\|")
+    .add(1)
+    .mean()
+)
+
+print(f" Found {cnt_treat} treatment-related records")
+print(f" Average treatment keywords per record: {avg_matches:.2f}")
+
+# Step 4: Save treatment subset
+print("4️⃣ Saving treatment subset...")
+out_dir = "../dataset/emergency_treatment"
+os.makedirs(out_dir, exist_ok=True)
+
+# Select records with treatment keywords
+subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning
+
+# Verify final subset columns
+print(" Final subset columns:")
+print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
+print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
+
+# Save with _opt suffix
+subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
+subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)
+
+print(f"✅ Generated optimized treatment subset with {len(subset)} records")
+print(f" Saved in: {out_dir}")
+print(f" Contains both emergency and treatment keywords")
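The regex built in Step 2 is a plain word-boundary alternation over the escaped keywords. A toy example (the keyword list here is made up, not the real treatment_keywords.txt):

```python
import re

keywords = ["cpr", "intubation", "x-ray"]            # toy list for illustration
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

text = "CPR was started; portable x-ray after intubation"
print(re.findall(pattern, text, flags=re.IGNORECASE))  # ['CPR', 'x-ray', 'intubation']
```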
dataset/scripts/commit_message_20250726_special_terms.txt ADDED
@@ -0,0 +1,39 @@
+refactor: migrate special terms to JSON configuration
+
+BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files
+
+1. Create New Configuration Files:
+   - Add special_terms_emergency.json
+     - Organize emergency terms by categories (cardiac, respiratory, etc.)
+     - Include all existing mappings with standardized structure
+   - Add special_terms_treatment.json
+     - Organize treatment terms by categories (imaging, medications, etc.)
+     - Maintain all existing term variants
+
+2. Update Processing Scripts:
+   - Modify 01_filter_emergency_opt.py:
+     - Load terms from JSON configuration
+     - Add term standardization
+     - Implement deduplication
+     - Preserve category information
+   - Modify 02_filter_treatment_opt.py:
+     - Similar updates for treatment terms
+     - Maintain consistent processing logic
+
+3. New Features:
+- Term standardization: Convert variants to standard form
+- Deduplication: Remove repeated terms while preserving order
+- Category-aware: Support for term categorization
+- Improved maintainability: Configuration separated from code
+
+4. Technical Details:
+- Use pathlib for file path handling
+- JSON structure supports hierarchical organization
+- Maintain backward compatibility
+- Add type hints for better code clarity
+
+Testing:
+- Verify JSON format
+- Confirm all mappings migrated correctly
+- Check term standardization
+- Validate deduplication logic
dataset/scripts/compare_subsets_opt.py ADDED
@@ -0,0 +1,124 @@
+# /scripts/compare_subsets_opt.py
+import pandas as pd
+from pathlib import Path
+from datetime import datetime
+
+def load_and_compare_subsets(format_type='csv'):
+    """
+    Load and compare the first 10 records from both optimized subsets
+
+    Args:
+        format_type (str): 'csv' or 'jsonl'
+    """
+    # Prepare output file
+    output_dir = Path("../analysis")
+    output_dir.mkdir(exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"
+
+    # Initialize markdown content
+    md_content = []
+    md_content.append("# Optimized Subsets Comparison Report\n")
+    md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+    md_content.append(f"File format: {format_type.upper()}\n")
+
+    # Set file paths based on format
+    if format_type == 'csv':
+        emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
+        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
+        # Load CSV files
+        emergency_df = pd.read_csv(emergency_path)
+        treatment_df = pd.read_csv(treatment_path)
+    else:  # jsonl
+        emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
+        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
+        # Load JSONL files
+        emergency_df = pd.read_json(emergency_path, lines=True)
+        treatment_df = pd.read_json(treatment_path, lines=True)
+
+    # Print and save basic statistics
+    print("\n📊 Basic Statistics:")
+    print("-" * 40)
+    md_content.append("\n## Basic Statistics\n")
+
+    stats = [
+        f"- Emergency subset total records: {len(emergency_df)}",
+        f"- Emergency+Treatment subset total records: {len(treatment_df)}",
+        f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
+        f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
+    ]
+
+    # Calculate average keywords using pattern
+    pattern = r'\|'
+    emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
+    treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()
+
+    stats.extend([
+        f"- Avg Emergency Keywords: {emergency_avg:.2f}",
+        f"- Avg Treatment Keywords: {treatment_avg:.2f}"
+    ])
+
+    # Print to console and add to markdown
+    for stat in stats:
+        print(stat.replace("- ", ""))
+    md_content.extend(stats)
+
+    # Compare first 10 records from Emergency subset
+    print("\n🔍 First 10 records from Emergency Subset:")
+    print("-" * 80)
+    md_content.append("\n## Emergency Subset (First 10 Records)\n")
+
+    for idx, row in emergency_df.head(10).iterrows():
+        print(f"\nRecord #{idx+1}")
+        print(f"Text preview: {row['clean_text'][:100]}...")
+        print(f"Matched keywords: {row['matched']}")
+        print(f"Text length: {len(row['clean_text'])}")
+        print("-" * 40)
+
+        md_content.extend([
+            f"\n### Record {idx+1}",
+            "```",
+            f"Text preview: {row['clean_text'][:100]}...",
+            f"Matched keywords: {row['matched']}",
+            f"Text length: {len(row['clean_text'])}",
+            "```\n"
+        ])
+
+    # Compare first 10 records from Emergency+Treatment subset
+    print("\n🔍 First 10 records from Emergency+Treatment Subset:")
+    print("-" * 80)
+    md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")
+
+    for idx, row in treatment_df.head(10).iterrows():
+        print(f"\nRecord #{idx+1}")
+        print(f"Text preview: {row['clean_text'][:100]}...")
+        print(f"Emergency keywords: {row['matched']}")
+        print(f"Treatment keywords: {row['treatment_matched']}")
+        print(f"Text length: {len(row['clean_text'])}")
+        print("-" * 40)
+
+        md_content.extend([
+            f"\n### Record {idx+1}",
+            "```",
+            f"Text preview: {row['clean_text'][:100]}...",
+            f"Emergency keywords: {row['matched']}",
+            f"Treatment keywords: {row['treatment_matched']}",
+            f"Text length: {len(row['clean_text'])}",
+            "```\n"
+        ])
+
+    # Save markdown content
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(md_content))
+
+    print(f"\n✅ Comparison complete!")
+    print(f"Report saved to: {output_file}")
+
+if __name__ == "__main__":
+    # Compare using CSV format
+    print("\nComparing CSV files...")
+    load_and_compare_subsets('csv')
+
+    # Compare using JSONL format
+    print("\nComparing JSONL files...")
+    load_and_compare_subsets('jsonl')
dataset/scripts/data_explorer_opt.py ADDED
@@ -0,0 +1,118 @@
+# /scripts/data_explorer_opt.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+
+def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
+    """Analyze subset data quality and distribution"""
+    print(f"\n{'='*50}")
+    print(f"Starting optimized dataset analysis: {file_path}")
+    print(f"Using keywords file: {keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*50}\n")
+
+    # Load data
+    print("1️⃣ Loading data...")
+    df = pd.read_csv(file_path)
+    output_dir = Path(output_dir)
+
+    # 1. Basic statistics
+    print("\n2️⃣ Calculating basic statistics...")
+    total = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_len = df['text_length'].mean()
+    print(f"Total records: {total}")
+    print(f"Average text length: {avg_len:.2f}")
+
+    # Initialize statistics dictionary with native Python types
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total),
+            'avg_length': float(avg_len)
+        },
+        'keyword_statistics': {}
+    }
+
+    # 2. Keyword analysis
+    print("\n3️⃣ Performing keyword analysis...")
+    with open(keywords_path, 'r') as f:
+        keywords = [line.strip() for line in f if line.strip()]
+    print(f"Loaded {len(keywords)} keywords")
+
+    # Count keywords and store in stats
+    for keyword in keywords:
+        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
+        stats['keyword_statistics'][keyword] = int(cnt)
+        print(f" - {keyword}: {cnt} records")
+
+    # 3. Visualization
+    print("\n4️⃣ Generating visualizations...")
+    output_path = Path(output_dir) / "plots"
+    output_path.mkdir(parents=True, exist_ok=True)
+    print(f"Charts will be saved in: {output_path}")
+
+    # 3.1 Keyword distribution chart
+    print(" - Generating keyword distribution chart...")
+    plt.figure(figsize=(15, 8))
+    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
+    plt.xticks(rotation=45, ha='right')
+    plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
+    plt.xlabel('Keywords')
+    plt.ylabel('Match Count')
+    plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 3.2 Text length distribution
+    print(" - Generating text length distribution...")
+    plt.figure(figsize=(10, 6))
+    df['text_length'].hist(bins=50)
+    plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
+    plt.xlabel('Text Length')
+    plt.ylabel('Frequency')
+    plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 3.3 Keyword co-occurrence analysis
+    print(" - Generating keyword co-occurrence heatmap...")
+    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
+    for text in df['clean_text']:
+        present_keywords = [k for k in keywords if k.lower() in text.lower()]
+        for i, k1 in enumerate(present_keywords):
+            for j, k2 in enumerate(present_keywords):
+                if i != j:
+                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1
+
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(cooccurrence_matrix,
+                xticklabels=keywords,
+                yticklabels=keywords,
+                cmap='YlOrRd')
+    plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 4. Save statistics
+    print("\n5️⃣ Saving statistics...")
+    stats_path = Path(output_dir) / "stats"
+    stats_path.mkdir(parents=True, exist_ok=True)
+    stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"
+
+    with open(stats_file, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+    print(f"Statistics saved to: {stats_file}")
+
+    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")
+
+if __name__ == "__main__":
+    # Set file paths for optimized version
+    emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    output_dir = "../analysis"
+
+    # Run analysis
+    analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
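The co-occurrence heatmap above is built with nested Python loops over every document; the boolean-matrix product used in data_explorer_treatment_opt.py below could compute the same counts vectorized. A sketch of that alternative (not part of this commit):

```python
import numpy as np
import pandas as pd

def cooccurrence(texts: pd.Series, keywords: list) -> np.ndarray:
    """Keyword-by-keyword co-occurrence counts via a boolean presence matrix."""
    lower = texts.fillna("").str.lower()
    presence = np.column_stack([lower.str.contains(k.lower(), regex=False) for k in keywords])
    cooc = presence.astype(int).T @ presence.astype(int)   # docs containing both keywords
    np.fill_diagonal(cooc, 0)                               # mirrors the i != j filter above
    return cooc
```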
dataset/scripts/data_explorer_treatment_opt.py ADDED
@@ -0,0 +1,263 @@
+# /scripts/data_explorer_treatment_opt.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+import numpy as np
+from tqdm import tqdm
+import re
+
+def calculate_density(matches, text_length):
+    """
+    Calculate keyword density per 1000 words
+
+    Args:
+        matches: Number of keyword matches
+        text_length: Total text length
+
+    Returns:
+        float: Density per 1000 words
+    """
+    return (matches / text_length) * 1000
+
+def analyze_treatment_subset(
+    treatment_file_path,
+    emergency_keywords_path,
+    treatment_keywords_path,
+    output_dir="analysis_treatment_opt"  # Updated default output directory
+):
+    """
+    Specialized analysis for optimized treatment subset focusing on:
+    1. Dual keyword analysis (emergency + treatment)
+    2. Path B effectiveness validation
+    3. Condition mapping data preparation
+    4. RAG readiness assessment
+    """
+    print(f"\n{'='*60}")
+    print(f"Treatment Subset Analysis (Optimized Version)")
+    print(f"Treatment file: {treatment_file_path}")
+    print(f"Emergency keywords: {emergency_keywords_path}")
+    print(f"Treatment keywords: {treatment_keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*60}\n")
+
+    # Load data
+    print("1️⃣ Loading optimized treatment subset data...")
+    df = pd.read_csv(treatment_file_path)
+    output_dir = Path(output_dir)
+
+    # Load keyword lists
+    print("2️⃣ Loading keyword lists...")
+    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
+        emergency_keywords = [line.strip() for line in f if line.strip()]
+
+    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
+        treatment_keywords = [line.strip() for line in f if line.strip()]
+
+    print(f" Emergency keywords: {len(emergency_keywords)}")
+    print(f" Treatment keywords: {len(treatment_keywords)}")
+
+    # Basic statistics
+    print("\n3️⃣ Computing basic statistics...")
+    total_records = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_length = df['text_length'].mean()
+
+    print(f" Total treatment records: {total_records}")
+    print(f" Average text length: {avg_length:.2f} characters")
+
+    # Initialize comprehensive statistics
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total_records),
+            'avg_text_length': float(avg_length),
+            'emergency_keywords_count': len(emergency_keywords),
+            'treatment_keywords_count': len(treatment_keywords)
+        },
+        'emergency_keyword_stats': {},
+        'treatment_keyword_stats': {},
+        'cooccurrence_analysis': {},
+        'path_b_validation': {},
+        'condition_mapping_candidates': {}
+    }
+
+    # Emergency keyword analysis in treatment subset
+    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
+    for keyword in emergency_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['emergency_keyword_stats'][keyword] = int(count)
+        print(f" Emergency: {keyword} -> {count} records")
+
+    # Treatment keyword analysis
+    print("\n5️⃣ Analyzing treatment keywords...")
+    for keyword in treatment_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['treatment_keyword_stats'][keyword] = int(count)
+        print(f" Treatment: {keyword} -> {count} records")
+
+    # Step 6: Co-occurrence analysis
+    print("\n6️⃣ Computing keyword co-occurrence patterns...")
+
+    # Initialize matrices for full dataset
+    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
+    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
+
+    # Pre-process text
+    print(" Pre-processing text...")
+    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+
+    # Process all emergency keywords
+    print("\n Processing all emergency keywords...")
+    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
+        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
+        matches = emergency_matrix[:, i].sum()
+        print(f" - {keyword}: {matches} matches")
+
+    # Process all treatment keywords
+    print("\n Processing all treatment keywords...")
+    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
+        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
+        matches = treatment_matrix[:, i].sum()
+        print(f" - {keyword}: {matches} matches")
+
+    # Compute co-occurrence matrix
+    print("\n Computing co-occurrence matrix...")
+    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
+    print(" Computation completed successfully")
+
+    # Extract results
+    print(" Extracting co-occurrence pairs...")
+    cooccurrence_pairs = []
+    for i, em_kw in enumerate(emergency_keywords):
+        for j, tr_kw in enumerate(treatment_keywords):
+            count = int(cooc_matrix[i, j])
+            if count > 0:
+                cooccurrence_pairs.append({
+                    'emergency_keyword': em_kw,
+                    'treatment_keyword': tr_kw,
+                    'cooccurrence_count': count,
+                    'percentage': float(count / len(df) * 100)
+                })
+
+    # Sort and store results
+    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
+    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs
+
+    print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
+    print(" Top 5 co-occurrence pairs:")
+    for i, pair in enumerate(cooccurrence_pairs[:5]):
+        print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
+
+    # Step 7: Path B validation metrics
+    print("\n7️⃣ Validating Path B strategy effectiveness...")
+
+    # Compute keyword density with progress bar
+    print(" Computing keyword density...")
+    with tqdm(total=2, desc="Density calculation") as pbar:
+        emergency_density = calculate_density(
+            emergency_matrix.sum(axis=1),
+            df['text_length']
+        )
+        pbar.update(1)
+
+        treatment_density = calculate_density(
+            treatment_matrix.sum(axis=1),
+            df['text_length']
+        )
+        pbar.update(1)
+
+    # Store density in dataframe for visualization
+    df['emergency_keyword_density'] = emergency_density
+    df['treatment_keyword_density'] = treatment_density
+
+    # Calculate statistics with the new density metrics
+    stats['path_b_validation'] = {
+        'avg_emergency_density': float(np.mean(emergency_density)),
+        'avg_treatment_density': float(np.mean(treatment_density)),
+        'high_density_records': int(sum(
+            (emergency_density >= np.percentile(emergency_density, 75)) &
+            (treatment_density >= np.percentile(treatment_density, 75))
+        )),
+        'precision_estimate': float(sum(
+            (emergency_density > 0) & (treatment_density > 0)
+        ) / len(df))
+    }
+
+    # Print detailed results
+    print("\n Results:")
+    print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
+    print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
+    print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
+    print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
+
+    # Sample distribution analysis
+    print("\n Density Distribution:")
+    density_counts = pd.DataFrame({
+        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
+        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
+    }).value_counts().head()
+    print(" Top 5 density combinations (emergency, treatment):")
+    for (em, tr), count in density_counts.items():
+        print(f" - {count} documents have {em} emergency and {tr} treatment density")
+
+    # Visualization
+    print("\n8️⃣ Generating visualizations...")
+    output_plots = output_dir / "plots"
+    output_plots.mkdir(parents=True, exist_ok=True)
+
+    # 1. Keyword density scatter plot with improved visualization
+    plt.figure(figsize=(12, 8))
+    plt.scatter(
+        emergency_density,
+        treatment_density,
+        alpha=0.6,
+        c=np.log1p(df['text_length']),
+        cmap='viridis'
+    )
+    plt.colorbar(label='Log Text Length')
+    plt.xlabel('Emergency Keyword Density (per 1000 words)')
+    plt.ylabel('Treatment Keyword Density (per 1000 words)')
+    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
+    plt.grid(True, alpha=0.3)
+
+    # Add mean lines
+    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
+    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
+    plt.legend()
+
+    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
+    plt.close()
+
+    # Save comprehensive statistics
+    print("\n9️⃣ Saving analysis results...")
+    stats_dir = output_dir / "stats"
+    stats_dir.mkdir(parents=True, exist_ok=True)
+
+    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+
+    print(f"✅ Treatment subset analysis complete! (Optimized Version)")
+    print(f" Results saved to: {output_dir}")
+    print(f" Plots: {output_plots}")
+    print(f" Statistics: {stats_dir}")
+
+    return stats
+
+if __name__ == "__main__":
+    # Configuration for optimized version
+    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    treatment_keywords = "../keywords/treatment_keywords.txt"
+    output_directory = "../analysis_treatment_opt"
+
+    # Run analysis
+    results = analyze_treatment_subset(
+        treatment_file,
+        emergency_keywords,
+        treatment_keywords,
+        output_directory
+    )
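The core of the co-occurrence step (step 6) is the single matrix product emergency_matrix.T @ treatment_matrix; a tiny worked example with made-up data shows what the resulting counts mean:

```python
import numpy as np

# 3 documents, 2 emergency keywords, 2 treatment keywords (made-up presence flags).
emergency_matrix = np.array([[True, False],
                             [True, True],
                             [False, True]])
treatment_matrix = np.array([[True, True],
                             [False, True],
                             [True, False]])

cooc = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
# cooc[i, j] = number of documents containing emergency keyword i AND treatment keyword j
print(cooc)   # [[1 2]
              #  [1 1]]
```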