YanBoChen committed on
Commit 8de0937 · 1 Parent(s): 9829a46

refactor: migrate special terms to JSON configuration


BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files

1. Create New Configuration Files:
   - Add special_terms_emergency.json
     - Organize emergency terms by categories (cardiac, respiratory, etc.)
     - Include all existing mappings with standardized structure
   - Add special_terms_treatment.json
     - Organize treatment terms by categories (imaging, medications, etc.)
     - Maintain all existing term variants

2. Update Processing Scripts:
   - Modify 01_filter_emergency_opt.py:
     - Load terms from JSON configuration
     - Add term standardization
     - Implement deduplication
     - Preserve category information
   - Modify 02_filter_treatment_opt.py:
     - Similar updates for treatment terms
     - Maintain consistent processing logic

3. New Features:
- Term standardization: Convert variants to standard form
- Deduplication: Remove repeated terms while preserving order
- Category-aware: Support for term categorization
- Improved maintainability: Configuration separated from code

4. Technical Details (loading logic condensed in the sketch below):
- Use pathlib for file path handling
- JSON structure supports hierarchical organization
- Maintain backward compatibility
- Add type hints for better code clarity
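The shared loading logic is small enough to show in one place. The sketch below is condensed from the script diffs further down; it assumes the scripts are run from dataset/scripts/ so that the relative ../keywords path resolves.

```python
import json
from pathlib import Path

# Condensed from 01_filter_emergency_opt.py below; working directory assumed
# to be dataset/scripts/ so that ../keywords resolves.
keywords_dir = Path("../keywords")
with open(keywords_dir / "special_terms_emergency.json", "r") as f:
    terms_by_category = json.load(f)   # {"cardiac": {"mi": [...], ...}, ...}

# Category structure is kept for reporting; lookups use a flattened dict.
special_terms = {}
for category in terms_by_category.values():
    special_terms.update(category)      # {"mi": [...], "acs": [...], ...}
```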

Testing (a standalone verification sketch follows):
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic
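A minimal way to exercise those checks is sketched below. The snippet is not part of the commit, and the duplicate-key assertion is an assumed invariant rather than something the scripts enforce.

```python
import json
from pathlib import Path

# Hypothetical verification snippet (run from dataset/scripts/ so ../keywords resolves).
for name in ("special_terms_emergency.json", "special_terms_treatment.json"):
    data = json.loads((Path("../keywords") / name).read_text(encoding="utf-8"))  # fails loudly on bad JSON

    flat = {}
    for category, terms in data.items():
        for standard, variants in terms.items():
            # Assumed invariant: a standard term appears in only one category
            assert standard not in flat, f"{standard} duplicated across categories in {name}"
            assert isinstance(variants, list) and variants, f"{standard} has no variants"
            flat[standard] = variants
    print(f"{name}: {len(data)} categories, {len(flat)} standard terms")
```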

dataset/keywords/special_terms_emergency.json ADDED
@@ -0,0 +1,26 @@
+{
+  "cardiac": {
+    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
+    "acs": ["acs", "ACS", "acute coronary syndrome"]
+  },
+  "respiratory": {
+    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
+    "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
+  },
+  "neurological": {
+    "loc": ["loc", "LOC", "loss of consciousness"],
+    "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
+  },
+  "shock": {
+    "shock": ["shock", "circulatory failure"],
+    "septic_shock": ["septic shock", "sepsis induced shock"]
+  },
+  "bleeding": {
+    "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
+    "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
+  },
+  "vital_signs": {
+    "hypotension": ["hypotension", "low bp", "low blood pressure"],
+    "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
+  }
+}
dataset/keywords/special_terms_treatment.json ADDED
@@ -0,0 +1,25 @@
+{
+  "imaging": {
+    "x-ray": ["x-ray", "x ray", "xray", "XR"],
+    "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
+    "us": ["us", "u/s", "ultrasound", "sonography"]
+  },
+  "medications": {
+    "iv": ["iv", "i.v.", "intravenous"],
+    "im": ["im", "i.m.", "intramuscular"],
+    "po": ["po", "p.o.", "per os", "by mouth"]
+  },
+  "procedures": {
+    "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
+    "intubation": ["intubation", "ETT", "endotracheal tube"],
+    "cardioversion": ["cardioversion", "electrical cardioversion"]
+  },
+  "monitoring": {
+    "ecg": ["ecg", "ekg", "electrocardiogram"],
+    "monitoring": ["monitoring", "continuous observation"]
+  },
+  "ventilation": {
+    "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
+    "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
+  }
+}
dataset/scripts/01_filter_emergency_opt.py CHANGED
@@ -1,36 +1,20 @@
 import os
 import re
+import json
 import pandas as pd
+from pathlib import Path
 
-# Medical term processor class for handling special terms
 class MedicalTermProcessor:
     def __init__(self):
-        # Emergency special terms mapping
-        self.emergency_special_terms = {
-            # Cardiac
-            'mi': ['mi', 'm.i.', 'myocardial infarction', 'MI'],
-            'acs': ['acs', 'ACS', 'acute coronary syndrome'],
-
-            # Respiratory
-            'ards': ['ards', 'ARDS', 'acute respiratory distress syndrome'],
-            'respiratory_failure': ['respiratory failure', 'resp failure', 'RF'],
-
-            # Neurological
-            'loc': ['loc', 'LOC', 'loss of consciousness'],
-            'cva': ['cva', 'CVA', 'stroke', 'cerebrovascular accident'],
-
-            # Shock States
-            'shock': ['shock', 'circulatory failure'],
-            'septic_shock': ['septic shock', 'sepsis induced shock'],
-
-            # Bleeding
-            'gi_bleed': ['gi bleed', 'gi bleeding', 'gastrointestinal hemorrhage', 'GI hemorrhage'],
-            'hemorrhage': ['hemorrhage', 'bleeding', 'blood loss'],
-
-            # Vital Signs
-            'hypotension': ['hypotension', 'low bp', 'low blood pressure'],
-            'tachycardia': ['tachycardia', 'elevated heart rate', 'fast heart rate']
-        }
+        # Load emergency special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_emergency.json", "r") as f:
+            self.emergency_terms_by_category = json.load(f)
+
+        # Flatten the nested structure for easy lookup
+        self.emergency_special_terms = {}
+        for category in self.emergency_terms_by_category.values():
+            self.emergency_special_terms.update(category)
 
     def get_all_variants(self):
         """Get all term variants including special terms"""
@@ -39,6 +23,32 @@ class MedicalTermProcessor:
             variants.extend(term_list)
         return variants
 
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.emergency_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
 # Function: Load keywords and print progress
 def load_keywords(path, processor):
     print(f"📥 Loading keywords from: {path}")
@@ -70,7 +80,7 @@ df["matched"] = (
     df["clean_text"]
     .fillna("")  # Convert NaN to empty string
     .str.findall(pattern, flags=re.IGNORECASE)
-    .apply(lambda lst: "|".join(lst) if lst else "")
+    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
 )
 df["has_emergency"] = df["matched"].str.len() > 0
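To make the effect of the new .apply(...) concrete, here is a small self-contained illustration of what process_matches does to a findall result. The mapping is a hand-copied subset of special_terms_emergency.json, and the helper mirrors (rather than imports) the method added above.

```python
# Illustration only: a two-entry subset of the emergency mapping.
terms = {
    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
}

def standardize(term: str) -> str:
    low = term.lower()
    for standard, variants in terms.items():
        if low in [v.lower() for v in variants]:
            return standard
    return term

matches = ["MI", "myocardial infarction", "ARDS"]   # e.g. output of .str.findall()

# Old lambda: "|".join(matches)  ->  "MI|myocardial infarction|ARDS"
# New process_matches: standardize, then deduplicate while preserving order
seen, unique = set(), []
for t in (standardize(m) for m in matches):
    if t.lower() not in seen:
        unique.append(t)
        seen.add(t.lower())
print("|".join(unique))   # -> "mi|ards"
```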
dataset/scripts/02_filter_treatment_opt.py ADDED
@@ -0,0 +1,131 @@
+import os
+import re
+import json
+import pandas as pd
+from pathlib import Path
+
+class MedicalTermProcessor:
+    def __init__(self):
+        # Load treatment special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_treatment.json", "r") as f:
+            self.treatment_terms_by_category = json.load(f)
+
+        # Flatten the nested structure for easy lookup
+        self.treatment_special_terms = {}
+        for category in self.treatment_terms_by_category.values():
+            self.treatment_special_terms.update(category)
+
+    def get_all_variants(self):
+        """Get all term variants including special terms"""
+        variants = []
+        for term_list in self.treatment_special_terms.values():
+            variants.extend(term_list)
+        return variants
+
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.treatment_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
+def load_keywords(path, processor):
+    """Load and preprocess treatment keywords"""
+    print(f"📥 Loading keywords from: {path}")
+
+    # Load basic keywords
+    with open(path, "r", encoding="utf-8") as f:
+        basic_kws = [line.strip() for line in f if line.strip()]
+
+    # Add special term variants
+    special_kws = processor.get_all_variants()
+    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates
+
+    print(f" Loaded {len(all_kws)} keywords (including variants)")
+    return all_kws
+
+# Step 1: Read optimized emergency subset
+print("1️⃣ Reading optimized emergency subset...")
+emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
+df = pd.read_json(emergency_path, lines=True)
+print(f" Loaded {len(df)} emergency records")
+print(f" Contains emergency keywords in 'matched' column")
+
+# Step 2: Load treatment keywords and match
+print("2️⃣ Loading treatment keywords and matching...")
+processor = MedicalTermProcessor()
+keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
+pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+
+# Step 3: Process text and match keywords
+print("3️⃣ Processing text and matching keywords...")
+# Match treatment keywords and add metadata columns
+df["treatment_matched"] = (
+    df["clean_text"]
+    .fillna("")  # Convert NaN to empty string
+    .str.findall(pattern, flags=re.IGNORECASE)
+    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
+)
+df["has_treatment"] = df["treatment_matched"].str.len() > 0
+
+# Add metadata columns for future use
+df["type"] = "treatment"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
+# Verify columns
+print(" Verifying columns...")
+print(f" - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
+print(f" - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
+
+# Calculate statistics
+cnt_treat = df["has_treatment"].sum()
+avg_matches = (
+    df[df["has_treatment"]]["treatment_matched"]
+    .str.count(r"\|")
+    .add(1)
+    .mean()
+)
+
+print(f" Found {cnt_treat} treatment-related records")
+print(f" Average treatment keywords per record: {avg_matches:.2f}")
+
+# Step 4: Save treatment subset
+print("4️⃣ Saving treatment subset...")
+out_dir = "../dataset/emergency_treatment"
+os.makedirs(out_dir, exist_ok=True)
+
+# Select records with treatment keywords
+subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning
+
+# Verify final subset columns
+print(" Final subset columns:")
+print(f" - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
+print(f" - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
+
+# Save with _opt suffix
+subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
+subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)
+
+print(f"✅ Generated optimized treatment subset with {len(subset)} records")
+print(f" Saved in: {out_dir}")
+print(f" Contains both emergency and treatment keywords")
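The regex built in Step 2 is a plain word-boundary alternation over the escaped keywords. A toy example (the keyword list here is made up, not the real treatment_keywords.txt):

```python
import re

keywords = ["cpr", "intubation", "x-ray"]            # toy list for illustration
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"

text = "CPR was started; portable x-ray after intubation"
print(re.findall(pattern, text, flags=re.IGNORECASE))  # ['CPR', 'x-ray', 'intubation']
```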
dataset/scripts/commit_message_20250726_special_terms.txt ADDED
@@ -0,0 +1,39 @@
+refactor: migrate special terms to JSON configuration
+
+BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files
+
+1. Create New Configuration Files:
+   - Add special_terms_emergency.json
+     - Organize emergency terms by categories (cardiac, respiratory, etc.)
+     - Include all existing mappings with standardized structure
+   - Add special_terms_treatment.json
+     - Organize treatment terms by categories (imaging, medications, etc.)
+     - Maintain all existing term variants
+
+2. Update Processing Scripts:
+   - Modify 01_filter_emergency_opt.py:
+     - Load terms from JSON configuration
+     - Add term standardization
+     - Implement deduplication
+     - Preserve category information
+   - Modify 02_filter_treatment_opt.py:
+     - Similar updates for treatment terms
+     - Maintain consistent processing logic
+
+3. New Features:
+- Term standardization: Convert variants to standard form
+- Deduplication: Remove repeated terms while preserving order
+- Category-aware: Support for term categorization
+- Improved maintainability: Configuration separated from code
+
+4. Technical Details:
+- Use pathlib for file path handling
+- JSON structure supports hierarchical organization
+- Maintain backward compatibility
+- Add type hints for better code clarity
+
+Testing:
+- Verify JSON format
+- Confirm all mappings migrated correctly
+- Check term standardization
+- Validate deduplication logic
dataset/scripts/compare_subsets_opt.py ADDED
@@ -0,0 +1,124 @@
+# /scripts/compare_subsets_opt.py
+import pandas as pd
+from pathlib import Path
+from datetime import datetime
+
+def load_and_compare_subsets(format_type='csv'):
+    """
+    Load and compare the first 10 records from both optimized subsets
+
+    Args:
+        format_type (str): 'csv' or 'jsonl'
+    """
+    # Prepare output file
+    output_dir = Path("../analysis")
+    output_dir.mkdir(exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"
+
+    # Initialize markdown content
+    md_content = []
+    md_content.append("# Optimized Subsets Comparison Report\n")
+    md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+    md_content.append(f"File format: {format_type.upper()}\n")
+
+    # Set file paths based on format
+    if format_type == 'csv':
+        emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
+        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
+        # Load CSV files
+        emergency_df = pd.read_csv(emergency_path)
+        treatment_df = pd.read_csv(treatment_path)
+    else:  # jsonl
+        emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
+        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
+        # Load JSONL files
+        emergency_df = pd.read_json(emergency_path, lines=True)
+        treatment_df = pd.read_json(treatment_path, lines=True)
+
+    # Print and save basic statistics
+    print("\n📊 Basic Statistics:")
+    print("-" * 40)
+    md_content.append("\n## Basic Statistics\n")
+
+    stats = [
+        f"- Emergency subset total records: {len(emergency_df)}",
+        f"- Emergency+Treatment subset total records: {len(treatment_df)}",
+        f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
+        f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
+    ]
+
+    # Calculate average keywords using pattern
+    pattern = r'\|'
+    emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
+    treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()
+
+    stats.extend([
+        f"- Avg Emergency Keywords: {emergency_avg:.2f}",
+        f"- Avg Treatment Keywords: {treatment_avg:.2f}"
+    ])
+
+    # Print to console and add to markdown
+    for stat in stats:
+        print(stat.replace("- ", ""))
+    md_content.extend(stats)
+
+    # Compare first 10 records from Emergency subset
+    print("\n🔍 First 10 records from Emergency Subset:")
+    print("-" * 80)
+    md_content.append("\n## Emergency Subset (First 10 Records)\n")
+
+    for idx, row in emergency_df.head(10).iterrows():
+        print(f"\nRecord #{idx+1}")
+        print(f"Text preview: {row['clean_text'][:100]}...")
+        print(f"Matched keywords: {row['matched']}")
+        print(f"Text length: {len(row['clean_text'])}")
+        print("-" * 40)
+
+        md_content.extend([
+            f"\n### Record {idx+1}",
+            "```",
+            f"Text preview: {row['clean_text'][:100]}...",
+            f"Matched keywords: {row['matched']}",
+            f"Text length: {len(row['clean_text'])}",
+            "```\n"
+        ])
+
+    # Compare first 10 records from Emergency+Treatment subset
+    print("\n🔍 First 10 records from Emergency+Treatment Subset:")
+    print("-" * 80)
+    md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")
+
+    for idx, row in treatment_df.head(10).iterrows():
+        print(f"\nRecord #{idx+1}")
+        print(f"Text preview: {row['clean_text'][:100]}...")
+        print(f"Emergency keywords: {row['matched']}")
+        print(f"Treatment keywords: {row['treatment_matched']}")
+        print(f"Text length: {len(row['clean_text'])}")
+        print("-" * 40)
+
+        md_content.extend([
+            f"\n### Record {idx+1}",
+            "```",
+            f"Text preview: {row['clean_text'][:100]}...",
+            f"Emergency keywords: {row['matched']}",
+            f"Treatment keywords: {row['treatment_matched']}",
+            f"Text length: {len(row['clean_text'])}",
+            "```\n"
+        ])
+
+    # Save markdown content
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(md_content))
+
+    print(f"\n✅ Comparison complete!")
+    print(f"Report saved to: {output_file}")
+
+if __name__ == "__main__":
+    # Compare using CSV format
+    print("\nComparing CSV files...")
+    load_and_compare_subsets('csv')
+
+    # Compare using JSONL format
+    print("\nComparing JSONL files...")
+    load_and_compare_subsets('jsonl')
dataset/scripts/data_explorer_opt.py ADDED
@@ -0,0 +1,118 @@
+# /scripts/data_explorer_opt.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+
+def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
+    """Analyze subset data quality and distribution"""
+    print(f"\n{'='*50}")
+    print(f"Starting optimized dataset analysis: {file_path}")
+    print(f"Using keywords file: {keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*50}\n")
+
+    # Load data
+    print("1️⃣ Loading data...")
+    df = pd.read_csv(file_path)
+    output_dir = Path(output_dir)
+
+    # 1. Basic statistics
+    print("\n2️⃣ Calculating basic statistics...")
+    total = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_len = df['text_length'].mean()
+    print(f"Total records: {total}")
+    print(f"Average text length: {avg_len:.2f}")
+
+    # Initialize statistics dictionary with native Python types
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total),
+            'avg_length': float(avg_len)
+        },
+        'keyword_statistics': {}
+    }
+
+    # 2. Keyword analysis
+    print("\n3️⃣ Performing keyword analysis...")
+    with open(keywords_path, 'r') as f:
+        keywords = [line.strip() for line in f if line.strip()]
+    print(f"Loaded {len(keywords)} keywords")
+
+    # Count keywords and store in stats
+    for keyword in keywords:
+        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
+        stats['keyword_statistics'][keyword] = int(cnt)
+        print(f" - {keyword}: {cnt} records")
+
+    # 3. Visualization
+    print("\n4️⃣ Generating visualizations...")
+    output_path = Path(output_dir) / "plots"
+    output_path.mkdir(parents=True, exist_ok=True)
+    print(f"Charts will be saved in: {output_path}")
+
+    # 3.1 Keyword distribution chart
+    print(" - Generating keyword distribution chart...")
+    plt.figure(figsize=(15, 8))
+    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
+    plt.xticks(rotation=45, ha='right')
+    plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
+    plt.xlabel('Keywords')
+    plt.ylabel('Match Count')
+    plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 3.2 Text length distribution
+    print(" - Generating text length distribution...")
+    plt.figure(figsize=(10, 6))
+    df['text_length'].hist(bins=50)
+    plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
+    plt.xlabel('Text Length')
+    plt.ylabel('Frequency')
+    plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 3.3 Keyword co-occurrence analysis
+    print(" - Generating keyword co-occurrence heatmap...")
+    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
+    for text in df['clean_text']:
+        present_keywords = [k for k in keywords if k.lower() in text.lower()]
+        for i, k1 in enumerate(present_keywords):
+            for j, k2 in enumerate(present_keywords):
+                if i != j:
+                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1
+
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(cooccurrence_matrix,
+                xticklabels=keywords,
+                yticklabels=keywords,
+                cmap='YlOrRd')
+    plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 4. Save statistics
+    print("\n5️⃣ Saving statistics...")
+    stats_path = Path(output_dir) / "stats"
+    stats_path.mkdir(parents=True, exist_ok=True)
+    stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"
+
+    with open(stats_file, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+    print(f"Statistics saved to: {stats_file}")
+
+    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")
+
+if __name__ == "__main__":
+    # Set file paths for optimized version
+    emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    output_dir = "../analysis"
+
+    # Run analysis
+    analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
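The co-occurrence heatmap above is built with nested Python loops over every document; the boolean-matrix product used in data_explorer_treatment_opt.py below could compute the same counts vectorized. A sketch of that alternative (not part of this commit):

```python
import numpy as np
import pandas as pd

def cooccurrence(texts: pd.Series, keywords: list) -> np.ndarray:
    """Keyword-by-keyword co-occurrence counts via a boolean presence matrix."""
    lower = texts.fillna("").str.lower()
    presence = np.column_stack([lower.str.contains(k.lower(), regex=False) for k in keywords])
    cooc = presence.astype(int).T @ presence.astype(int)   # docs containing both keywords
    np.fill_diagonal(cooc, 0)                               # mirrors the i != j filter above
    return cooc
```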
dataset/scripts/data_explorer_treatment_opt.py ADDED
@@ -0,0 +1,263 @@
+# /scripts/data_explorer_treatment_opt.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+import numpy as np
+from tqdm import tqdm
+import re
+
+def calculate_density(matches, text_length):
+    """
+    Calculate keyword density per 1000 words
+
+    Args:
+        matches: Number of keyword matches
+        text_length: Total text length
+
+    Returns:
+        float: Density per 1000 words
+    """
+    return (matches / text_length) * 1000
+
+def analyze_treatment_subset(
+    treatment_file_path,
+    emergency_keywords_path,
+    treatment_keywords_path,
+    output_dir="analysis_treatment_opt"  # Updated default output directory
+):
+    """
+    Specialized analysis for optimized treatment subset focusing on:
+    1. Dual keyword analysis (emergency + treatment)
+    2. Path B effectiveness validation
+    3. Condition mapping data preparation
+    4. RAG readiness assessment
+    """
+    print(f"\n{'='*60}")
+    print(f"Treatment Subset Analysis (Optimized Version)")
+    print(f"Treatment file: {treatment_file_path}")
+    print(f"Emergency keywords: {emergency_keywords_path}")
+    print(f"Treatment keywords: {treatment_keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*60}\n")
+
+    # Load data
+    print("1️⃣ Loading optimized treatment subset data...")
+    df = pd.read_csv(treatment_file_path)
+    output_dir = Path(output_dir)
+
+    # Load keyword lists
+    print("2️⃣ Loading keyword lists...")
+    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
+        emergency_keywords = [line.strip() for line in f if line.strip()]
+
+    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
+        treatment_keywords = [line.strip() for line in f if line.strip()]
+
+    print(f" Emergency keywords: {len(emergency_keywords)}")
+    print(f" Treatment keywords: {len(treatment_keywords)}")
+
+    # Basic statistics
+    print("\n3️⃣ Computing basic statistics...")
+    total_records = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_length = df['text_length'].mean()
+
+    print(f" Total treatment records: {total_records}")
+    print(f" Average text length: {avg_length:.2f} characters")
+
+    # Initialize comprehensive statistics
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total_records),
+            'avg_text_length': float(avg_length),
+            'emergency_keywords_count': len(emergency_keywords),
+            'treatment_keywords_count': len(treatment_keywords)
+        },
+        'emergency_keyword_stats': {},
+        'treatment_keyword_stats': {},
+        'cooccurrence_analysis': {},
+        'path_b_validation': {},
+        'condition_mapping_candidates': {}
+    }
+
+    # Emergency keyword analysis in treatment subset
+    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
+    for keyword in emergency_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['emergency_keyword_stats'][keyword] = int(count)
+        print(f" Emergency: {keyword} -> {count} records")
+
+    # Treatment keyword analysis
+    print("\n5️⃣ Analyzing treatment keywords...")
+    for keyword in treatment_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['treatment_keyword_stats'][keyword] = int(count)
+        print(f" Treatment: {keyword} -> {count} records")
+
+    # Step 6: Co-occurrence analysis
+    print("\n6️⃣ Computing keyword co-occurrence patterns...")
+
+    # Initialize matrices for full dataset
+    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
+    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
+
+    # Pre-process text
+    print(" Pre-processing text...")
+    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+
+    # Process all emergency keywords
+    print("\n Processing all emergency keywords...")
+    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
+        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
+        matches = emergency_matrix[:, i].sum()
+        print(f" - {keyword}: {matches} matches")
+
+    # Process all treatment keywords
+    print("\n Processing all treatment keywords...")
+    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
+        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
+        matches = treatment_matrix[:, i].sum()
+        print(f" - {keyword}: {matches} matches")
+
+    # Compute co-occurrence matrix
+    print("\n Computing co-occurrence matrix...")
+    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
+    print(" Computation completed successfully")
+
+    # Extract results
+    print(" Extracting co-occurrence pairs...")
+    cooccurrence_pairs = []
+    for i, em_kw in enumerate(emergency_keywords):
+        for j, tr_kw in enumerate(treatment_keywords):
+            count = int(cooc_matrix[i, j])
+            if count > 0:
+                cooccurrence_pairs.append({
+                    'emergency_keyword': em_kw,
+                    'treatment_keyword': tr_kw,
+                    'cooccurrence_count': count,
+                    'percentage': float(count / len(df) * 100)
+                })
+
+    # Sort and store results
+    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
+    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs
+
+    print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
+    print(" Top 5 co-occurrence pairs:")
+    for i, pair in enumerate(cooccurrence_pairs[:5]):
+        print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
+
+    # Step 7: Path B validation metrics
+    print("\n7️⃣ Validating Path B strategy effectiveness...")
+
+    # Compute keyword density with progress bar
+    print(" Computing keyword density...")
+    with tqdm(total=2, desc="Density calculation") as pbar:
+        emergency_density = calculate_density(
+            emergency_matrix.sum(axis=1),
+            df['text_length']
+        )
+        pbar.update(1)
+
+        treatment_density = calculate_density(
+            treatment_matrix.sum(axis=1),
+            df['text_length']
+        )
+        pbar.update(1)
+
+    # Store density in dataframe for visualization
+    df['emergency_keyword_density'] = emergency_density
+    df['treatment_keyword_density'] = treatment_density
+
+    # Calculate statistics with the new density metrics
+    stats['path_b_validation'] = {
+        'avg_emergency_density': float(np.mean(emergency_density)),
+        'avg_treatment_density': float(np.mean(treatment_density)),
+        'high_density_records': int(sum(
+            (emergency_density >= np.percentile(emergency_density, 75)) &
+            (treatment_density >= np.percentile(treatment_density, 75))
+        )),
+        'precision_estimate': float(sum(
+            (emergency_density > 0) & (treatment_density > 0)
+        ) / len(df))
+    }
+
+    # Print detailed results
+    print("\n Results:")
+    print(f" - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
+    print(f" - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
+    print(f" - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
+    print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
+
+    # Sample distribution analysis
+    print("\n Density Distribution:")
+    density_counts = pd.DataFrame({
+        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
+        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
+    }).value_counts().head()
+    print(" Top 5 density combinations (emergency, treatment):")
+    for (em, tr), count in density_counts.items():
+        print(f" - {count} documents have {em} emergency and {tr} treatment density")
+
+    # Visualization
+    print("\n8️⃣ Generating visualizations...")
+    output_plots = output_dir / "plots"
+    output_plots.mkdir(parents=True, exist_ok=True)
+
+    # 1. Keyword density scatter plot with improved visualization
+    plt.figure(figsize=(12, 8))
+    plt.scatter(
+        emergency_density,
+        treatment_density,
+        alpha=0.6,
+        c=np.log1p(df['text_length']),
+        cmap='viridis'
+    )
+    plt.colorbar(label='Log Text Length')
+    plt.xlabel('Emergency Keyword Density (per 1000 words)')
+    plt.ylabel('Treatment Keyword Density (per 1000 words)')
+    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
+    plt.grid(True, alpha=0.3)
+
+    # Add mean lines
+    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
+    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
+    plt.legend()
+
+    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
+    plt.close()
+
+    # Save comprehensive statistics
+    print("\n9️⃣ Saving analysis results...")
+    stats_dir = output_dir / "stats"
+    stats_dir.mkdir(parents=True, exist_ok=True)
+
+    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+
+    print(f"✅ Treatment subset analysis complete! (Optimized Version)")
+    print(f" Results saved to: {output_dir}")
+    print(f" Plots: {output_plots}")
+    print(f" Statistics: {stats_dir}")
+
+    return stats
+
+if __name__ == "__main__":
+    # Configuration for optimized version
+    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    treatment_keywords = "../keywords/treatment_keywords.txt"
+    output_directory = "../analysis_treatment_opt"
+
+    # Run analysis
+    results = analyze_treatment_subset(
+        treatment_file,
+        emergency_keywords,
+        treatment_keywords,
+        output_directory
+    )
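The core of the co-occurrence step (step 6) is the single matrix product emergency_matrix.T @ treatment_matrix; a tiny worked example with made-up data shows what the resulting counts mean:

```python
import numpy as np

# 3 documents, 2 emergency keywords, 2 treatment keywords (made-up presence flags).
emergency_matrix = np.array([[True, False],
                             [True, True],
                             [False, True]])
treatment_matrix = np.array([[True, True],
                             [False, True],
                             [True, False]])

cooc = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
# cooc[i, j] = number of documents containing emergency keyword i AND treatment keyword j
print(cooc)   # [[1 2]
              #  [1 1]]
```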