refactor: migrate special terms to JSON configuration
BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files
1. Create New Configuration Files:
- Add special_terms_emergency.json
- Organize emergency terms by categories (cardiac, respiratory, etc.)
- Include all existing mappings with standardized structure
- Add special_terms_treatment.json
- Organize treatment terms by categories (imaging, medications, etc.)
- Maintain all existing term variants
2. Update Processing Scripts:
- Modify 01_filter_emergency_opt.py:
- Load terms from JSON configuration
- Add term standardization
- Implement deduplication
- Preserve category information
- Modify 02_filter_treatment_opt.py:
- Similar updates for treatment terms
- Maintain consistent processing logic
3. New Features:
- Term standardization: Convert variants to standard form
- Deduplication: Remove repeated terms while preserving order
- Category-aware: Support for term categorization
- Improved maintainability: Configuration separated from code
4. Technical Details:
- Use pathlib for file path handling
- JSON structure supports hierarchical organization
- Maintain backward compatibility
- Add type hints for better code clarity
Testing:
- Verify JSON format
- Confirm all mappings migrated correctly
- Check term standardization
- Validate deduplication logic
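As a quick illustration of the standardization and deduplication listed under "New Features", the following minimal sketch (not code from this commit) shows the intended behaviour, using two of the mappings from the JSON files added below:

    # Hypothetical, self-contained sketch of standardize-then-deduplicate.
    special_terms = {
        "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
        "hypotension": ["hypotension", "low bp", "low blood pressure"],
    }

    def standardize(term):
        for standard, variants in special_terms.items():
            if term.lower() in [v.lower() for v in variants]:
                return standard
        return term

    matches = ["MI", "myocardial infarction", "low bp", "mi"]
    seen, unique = set(), []
    for t in (standardize(m) for m in matches):
        if t.lower() not in seen:
            unique.append(t)
            seen.add(t.lower())
    print("|".join(unique))  # -> "mi|hypotension"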
- dataset/keywords/special_terms_emergency.json +26 -0
- dataset/keywords/special_terms_treatment.json +25 -0
- dataset/scripts/01_filter_emergency_opt.py +37 -27
- dataset/scripts/02_filter_treatment_opt.py +131 -0
- dataset/scripts/commit_message_20250726_special_terms.txt +39 -0
- dataset/scripts/compare_subsets_opt.py +124 -0
- dataset/scripts/data_explorer_opt.py +118 -0
- dataset/scripts/data_explorer_treatment_opt.py +263 -0
dataset/keywords/special_terms_emergency.json
@@ -0,0 +1,26 @@
+{
+  "cardiac": {
+    "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
+    "acs": ["acs", "ACS", "acute coronary syndrome"]
+  },
+  "respiratory": {
+    "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
+    "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
+  },
+  "neurological": {
+    "loc": ["loc", "LOC", "loss of consciousness"],
+    "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
+  },
+  "shock": {
+    "shock": ["shock", "circulatory failure"],
+    "septic_shock": ["septic shock", "sepsis induced shock"]
+  },
+  "bleeding": {
+    "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
+    "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
+  },
+  "vital_signs": {
+    "hypotension": ["hypotension", "low bp", "low blood pressure"],
+    "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
+  }
+}
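One way to cover the "Verify JSON format" item from the testing notes is a short check along these lines (a sketch, not part of the commit; paths assume the repository root as the working directory):

    import json
    from pathlib import Path

    # Sketch: confirm each file maps categories -> standard terms -> non-empty variant lists.
    for name in ["special_terms_emergency.json", "special_terms_treatment.json"]:
        data = json.loads(Path("dataset/keywords", name).read_text(encoding="utf-8"))
        for category, terms in data.items():
            assert isinstance(terms, dict), f"{name}:{category} should map standard terms to variant lists"
            for standard, variants in terms.items():
                assert isinstance(variants, list) and variants, f"{name}:{category}:{standard} needs variants"
        print(f"{name}: {sum(len(t) for t in data.values())} standard terms OK")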
dataset/keywords/special_terms_treatment.json
@@ -0,0 +1,25 @@
+{
+  "imaging": {
+    "x-ray": ["x-ray", "x ray", "xray", "XR"],
+    "ct": ["ct", "ct-scan", "cat scan", "computed tomography"],
+    "us": ["us", "u/s", "ultrasound", "sonography"]
+  },
+  "medications": {
+    "iv": ["iv", "i.v.", "intravenous"],
+    "im": ["im", "i.m.", "intramuscular"],
+    "po": ["po", "p.o.", "per os", "by mouth"]
+  },
+  "procedures": {
+    "cpr": ["cpr", "CPR", "cardiopulmonary resuscitation"],
+    "intubation": ["intubation", "ETT", "endotracheal tube"],
+    "cardioversion": ["cardioversion", "electrical cardioversion"]
+  },
+  "monitoring": {
+    "ecg": ["ecg", "ekg", "electrocardiogram"],
+    "monitoring": ["monitoring", "continuous observation"]
+  },
+  "ventilation": {
+    "bipap": ["bipap", "BiPAP", "bi-level positive airway pressure"],
+    "cpap": ["cpap", "CPAP", "continuous positive airway pressure"]
+  }
+}
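The filter scripts below keep this nested structure for category information and also flatten it into a single standard-term-to-variants lookup. Roughly, as a standalone sketch (again assuming the repository root as the working directory):

    import json
    from pathlib import Path

    # Standalone sketch of the flattening step used by the filter scripts.
    nested = json.loads(Path("dataset/keywords/special_terms_treatment.json").read_text(encoding="utf-8"))
    flat = {}
    for category in nested.values():  # "imaging", "medications", "procedures", ...
        flat.update(category)         # e.g. "ct" -> ["ct", "ct-scan", "cat scan", "computed tomography"]
    print(len(flat), "standard treatment terms")  # 13 for the file above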
dataset/scripts/01_filter_emergency_opt.py
@@ -1,36 +1,20 @@
 import os
 import re
+import json
 import pandas as pd
+from pathlib import Path
 
-# Medical term processor class for handling special terms
 class MedicalTermProcessor:
     def __init__(self):
-        #
-        self.emergency_special_terms = {
-            # Cardiac
-            'mi': ['mi', 'm.i.', 'myocardial infarction', 'MI'],
-            'acs': ['acs', 'ACS', 'acute coronary syndrome'],
-
-            # Respiratory
-            'ards': ['ards', 'ARDS', 'acute respiratory distress syndrome'],
-            'respiratory_failure': ['respiratory failure', 'resp failure', 'RF'],
-
-            # Neurological
-            'loc': ['loc', 'LOC', 'loss of consciousness'],
-            'cva': ['cva', 'CVA', 'stroke', 'cerebrovascular accident'],
-
-            # Shock States
-            'shock': ['shock', 'circulatory failure'],
-            'septic_shock': ['septic shock', 'sepsis induced shock'],
-
-            # Bleeding
-            'gi_bleed': ['gi bleed', 'gi bleeding', 'gastrointestinal hemorrhage', 'GI hemorrhage'],
-            'hemorrhage': ['hemorrhage', 'bleeding', 'blood loss'],
-
-            # Vital Signs
-            'hypotension': ['hypotension', 'low bp', 'low blood pressure'],
-            'tachycardia': ['tachycardia', 'elevated heart rate', 'fast heart rate']
-        }
+        # Load emergency special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_emergency.json", "r") as f:
+            self.emergency_terms_by_category = json.load(f)
+
+        # Flatten the nested structure for easy lookup
+        self.emergency_special_terms = {}
+        for category in self.emergency_terms_by_category.values():
+            self.emergency_special_terms.update(category)
 
     def get_all_variants(self):
         """Get all term variants including special terms"""
@@ -39,6 +23,32 @@ class MedicalTermProcessor:
             variants.extend(term_list)
         return variants
 
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.emergency_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
 # Function: Load keywords and print progress
 def load_keywords(path, processor):
     print(f"📥 Loading keywords from: {path}")
@@ -70,7 +80,7 @@ df["matched"] = (
     df["clean_text"]
     .fillna("")  # Convert NaN to empty string
     .str.findall(pattern, flags=re.IGNORECASE)
-    .apply(lambda
+    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
 )
 df["has_emergency"] = df["matched"].str.len() > 0
 
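One caveat about the loading code above: Path("../keywords") is resolved against the current working directory, so the script must be launched from dataset/scripts/ for the JSON files to be found. A common alternative, not used in this commit, is to resolve the path against the script's own location:

    from pathlib import Path

    # Alternative (not in the commit): locate the keywords directory relative to this file
    # instead of relying on the caller's working directory.
    keywords_dir = Path(__file__).resolve().parent.parent / "keywords"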
dataset/scripts/02_filter_treatment_opt.py
@@ -0,0 +1,131 @@
+import os
+import re
+import json
+import pandas as pd
+from pathlib import Path
+
+class MedicalTermProcessor:
+    def __init__(self):
+        # Load treatment special terms from JSON
+        keywords_dir = Path("../keywords")
+        with open(keywords_dir / "special_terms_treatment.json", "r") as f:
+            self.treatment_terms_by_category = json.load(f)
+
+        # Flatten the nested structure for easy lookup
+        self.treatment_special_terms = {}
+        for category in self.treatment_terms_by_category.values():
+            self.treatment_special_terms.update(category)
+
+    def get_all_variants(self):
+        """Get all term variants including special terms"""
+        variants = []
+        for term_list in self.treatment_special_terms.values():
+            variants.extend(term_list)
+        return variants
+
+    def standardize_term(self, term: str) -> str:
+        """Convert a term to its standard form if it's a variant"""
+        term_lower = term.lower()
+        for standard_term, variants in self.treatment_special_terms.items():
+            if term_lower in [v.lower() for v in variants]:
+                return standard_term
+        return term
+
+    def process_matches(self, matches: list) -> str:
+        """Process matches to standardize terms and remove duplicates"""
+        if not matches:
+            return ""
+
+        # Standardize terms
+        standardized = [self.standardize_term(match) for match in matches]
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_matches = []
+        for term in standardized:
+            if term.lower() not in seen:
+                unique_matches.append(term)
+                seen.add(term.lower())
+
+        return "|".join(unique_matches)
+
+def load_keywords(path, processor):
+    """Load and preprocess treatment keywords"""
+    print(f"📥 Loading keywords from: {path}")
+
+    # Load basic keywords
+    with open(path, "r", encoding="utf-8") as f:
+        basic_kws = [line.strip() for line in f if line.strip()]
+
+    # Add special term variants
+    special_kws = processor.get_all_variants()
+    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates
+
+    print(f"   Loaded {len(all_kws)} keywords (including variants)")
+    return all_kws
+
+# Step 1: Read optimized emergency subset
+print("1️⃣ Reading optimized emergency subset...")
+emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
+df = pd.read_json(emergency_path, lines=True)
+print(f"   Loaded {len(df)} emergency records")
+print(f"   Contains emergency keywords in 'matched' column")
+
+# Step 2: Load treatment keywords and match
+print("2️⃣ Loading treatment keywords and matching...")
+processor = MedicalTermProcessor()
+keywords = load_keywords("../keywords/treatment_keywords.txt", processor)
+pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+
+# Step 3: Process text and match keywords
+print("3️⃣ Processing text and matching keywords...")
+# Match treatment keywords and add metadata columns
+df["treatment_matched"] = (
+    df["clean_text"]
+    .fillna("")  # Convert NaN to empty string
+    .str.findall(pattern, flags=re.IGNORECASE)
+    .apply(lambda matches: processor.process_matches(matches))  # Use new process_matches method
+)
+df["has_treatment"] = df["treatment_matched"].str.len() > 0
+
+# Add metadata columns for future use
+df["type"] = "treatment"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
+# Verify columns
+print("   Verifying columns...")
+print(f"   - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
+print(f"   - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
+
+# Calculate statistics
+cnt_treat = df["has_treatment"].sum()
+avg_matches = (
+    df[df["has_treatment"]]["treatment_matched"]
+    .str.count(r"\|")
+    .add(1)
+    .mean()
+)
+
+print(f"   Found {cnt_treat} treatment-related records")
+print(f"   Average treatment keywords per record: {avg_matches:.2f}")
+
+# Step 4: Save treatment subset
+print("4️⃣ Saving treatment subset...")
+out_dir = "../dataset/emergency_treatment"
+os.makedirs(out_dir, exist_ok=True)
+
+# Select records with treatment keywords
+subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning
+
+# Verify final subset columns
+print("   Final subset columns:")
+print(f"   - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
+print(f"   - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
+
+# Save with _opt suffix
+subset.to_json(f"{out_dir}/emergency_treatment_subset_opt.jsonl", orient="records", lines=True)
+subset.to_csv(f"{out_dir}/emergency_treatment_subset_opt.csv", index=False)
+
+print(f"✅ Generated optimized treatment subset with {len(subset)} records")
+print(f"   Saved in: {out_dir}")
+print(f"   Contains both emergency and treatment keywords")
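The pattern built in step 2 joins every keyword into one word-bounded alternation, and findall then returns each occurrence. A standalone illustration with made-up keywords and text (not part of the commit):

    import re

    # Illustration of the alternation pattern constructed by the script above.
    keywords = ["iv", "ct", "x-ray", "cpr"]
    pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
    text = "Started IV fluids, ordered a CT and a chest x-ray."
    print(re.findall(pattern, text, flags=re.IGNORECASE))  # ['IV', 'CT', 'x-ray']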
dataset/scripts/commit_message_20250726_special_terms.txt
@@ -0,0 +1,39 @@
+refactor: migrate special terms to JSON configuration
+
+BREAKING CHANGE: Move hardcoded special terms mapping to external JSON files
+
+1. Create New Configuration Files:
+- Add special_terms_emergency.json
+- Organize emergency terms by categories (cardiac, respiratory, etc.)
+- Include all existing mappings with standardized structure
+- Add special_terms_treatment.json
+- Organize treatment terms by categories (imaging, medications, etc.)
+- Maintain all existing term variants
+
+2. Update Processing Scripts:
+- Modify 01_filter_emergency_opt.py:
+- Load terms from JSON configuration
+- Add term standardization
+- Implement deduplication
+- Preserve category information
+- Modify 02_filter_treatment_opt.py:
+- Similar updates for treatment terms
+- Maintain consistent processing logic
+
+3. New Features:
+- Term standardization: Convert variants to standard form
+- Deduplication: Remove repeated terms while preserving order
+- Category-aware: Support for term categorization
+- Improved maintainability: Configuration separated from code
+
+4. Technical Details:
+- Use pathlib for file path handling
+- JSON structure supports hierarchical organization
+- Maintain backward compatibility
+- Add type hints for better code clarity
+
+Testing:
+- Verify JSON format
+- Confirm all mappings migrated correctly
+- Check term standardization
+- Validate deduplication logic
dataset/scripts/compare_subsets_opt.py
@@ -0,0 +1,124 @@
+# /scripts/compare_subsets_opt.py
+import pandas as pd
+from pathlib import Path
+from datetime import datetime
+
+def load_and_compare_subsets(format_type='csv'):
+    """
+    Load and compare the first 10 records from both optimized subsets
+
+    Args:
+        format_type (str): 'csv' or 'jsonl'
+    """
+    # Prepare output file
+    output_dir = Path("../analysis")
+    output_dir.mkdir(exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_file = output_dir / f"subset_comparison_first10_records_{timestamp}.md"
+
+    # Initialize markdown content
+    md_content = []
+    md_content.append("# Optimized Subsets Comparison Report\n")
+    md_content.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+    md_content.append(f"File format: {format_type.upper()}\n")
+
+    # Set file paths based on format
+    if format_type == 'csv':
+        emergency_path = "../dataset/emergency/emergency_subset_opt.csv"
+        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
+        # Load CSV files
+        emergency_df = pd.read_csv(emergency_path)
+        treatment_df = pd.read_csv(treatment_path)
+    else:  # jsonl
+        emergency_path = "../dataset/emergency/emergency_subset_opt.jsonl"
+        treatment_path = "../dataset/emergency_treatment/emergency_treatment_subset_opt.jsonl"
+        # Load JSONL files
+        emergency_df = pd.read_json(emergency_path, lines=True)
+        treatment_df = pd.read_json(treatment_path, lines=True)
+
+    # Print and save basic statistics
+    print("\n📊 Basic Statistics:")
+    print("-" * 40)
+    md_content.append("\n## Basic Statistics\n")
+
+    stats = [
+        f"- Emergency subset total records: {len(emergency_df)}",
+        f"- Emergency+Treatment subset total records: {len(treatment_df)}",
+        f"- Avg Emergency Text Length: {emergency_df['clean_text'].str.len().mean():.2f}",
+        f"- Avg Treatment Text Length: {treatment_df['clean_text'].str.len().mean():.2f}"
+    ]
+
+    # Calculate average keywords using pattern
+    pattern = r'\|'
+    emergency_avg = emergency_df['matched'].str.count(pattern).add(1).mean()
+    treatment_avg = treatment_df['matched'].str.count(pattern).add(1).mean()
+
+    stats.extend([
+        f"- Avg Emergency Keywords: {emergency_avg:.2f}",
+        f"- Avg Treatment Keywords: {treatment_avg:.2f}"
+    ])
+
+    # Print to console and add to markdown
+    for stat in stats:
+        print(stat.replace("- ", ""))
+    md_content.extend(stats)
+
+    # Compare first 10 records from Emergency subset
+    print("\n🔍 First 10 records from Emergency Subset:")
+    print("-" * 80)
+    md_content.append("\n## Emergency Subset (First 10 Records)\n")
+
+    for idx, row in emergency_df.head(10).iterrows():
+        print(f"\nRecord #{idx+1}")
+        print(f"Text preview: {row['clean_text'][:100]}...")
+        print(f"Matched keywords: {row['matched']}")
+        print(f"Text length: {len(row['clean_text'])}")
+        print("-" * 40)
+
+        md_content.extend([
+            f"\n### Record {idx+1}",
+            "```",
+            f"Text preview: {row['clean_text'][:100]}...",
+            f"Matched keywords: {row['matched']}",
+            f"Text length: {len(row['clean_text'])}",
+            "```\n"
+        ])
+
+    # Compare first 10 records from Emergency+Treatment subset
+    print("\n🔍 First 10 records from Emergency+Treatment Subset:")
+    print("-" * 80)
+    md_content.append("\n## Emergency+Treatment Subset (First 10 Records)\n")
+
+    for idx, row in treatment_df.head(10).iterrows():
+        print(f"\nRecord #{idx+1}")
+        print(f"Text preview: {row['clean_text'][:100]}...")
+        print(f"Emergency keywords: {row['matched']}")
+        print(f"Treatment keywords: {row['treatment_matched']}")
+        print(f"Text length: {len(row['clean_text'])}")
+        print("-" * 40)
+
+        md_content.extend([
+            f"\n### Record {idx+1}",
+            "```",
+            f"Text preview: {row['clean_text'][:100]}...",
+            f"Emergency keywords: {row['matched']}",
+            f"Treatment keywords: {row['treatment_matched']}",
+            f"Text length: {len(row['clean_text'])}",
+            "```\n"
+        ])
+
+    # Save markdown content
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write('\n'.join(md_content))
+
+    print(f"\n✅ Comparison complete!")
+    print(f"Report saved to: {output_file}")
+
+if __name__ == "__main__":
+    # Compare using CSV format
+    print("\nComparing CSV files...")
+    load_and_compare_subsets('csv')
+
+    # Compare using JSONL format
+    print("\nComparing JSONL files...")
+    load_and_compare_subsets('jsonl')
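The average-keyword figures in this report come from the pipe-separated matched strings written by the filter scripts: the keyword count per record is taken as the number of "|" separators plus one. A standalone illustration with made-up values (not part of the commit):

    import pandas as pd

    # Worked example of the pipe-count estimate used above.
    matched = pd.Series(["mi|hypotension", "cva", "shock|gi_bleed|hemorrhage"])
    print(matched.str.count(r"\|").add(1).tolist())  # [2, 1, 3]
    print(matched.str.count(r"\|").add(1).mean())    # 2.0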
dataset/scripts/data_explorer_opt.py
@@ -0,0 +1,118 @@
+# /scripts/data_explorer_opt.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+
+def analyze_subset(file_path, keywords_path, output_dir="analysis", subset_name="emergency"):
+    """Analyze subset data quality and distribution"""
+    print(f"\n{'='*50}")
+    print(f"Starting optimized dataset analysis: {file_path}")
+    print(f"Using keywords file: {keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*50}\n")
+
+    # Load data
+    print("1️⃣ Loading data...")
+    df = pd.read_csv(file_path)
+    output_dir = Path(output_dir)
+
+    # 1. Basic statistics
+    print("\n2️⃣ Calculating basic statistics...")
+    total = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_len = df['text_length'].mean()
+    print(f"Total records: {total}")
+    print(f"Average text length: {avg_len:.2f}")
+
+    # Initialize statistics dictionary with native Python types
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total),
+            'avg_length': float(avg_len)
+        },
+        'keyword_statistics': {}
+    }
+
+    # 2. Keyword analysis
+    print("\n3️⃣ Performing keyword analysis...")
+    with open(keywords_path, 'r') as f:
+        keywords = [line.strip() for line in f if line.strip()]
+    print(f"Loaded {len(keywords)} keywords")
+
+    # Count keywords and store in stats
+    for keyword in keywords:
+        cnt = df['clean_text'].str.contains(keyword, case=False).sum()
+        stats['keyword_statistics'][keyword] = int(cnt)
+        print(f"  - {keyword}: {cnt} records")
+
+    # 3. Visualization
+    print("\n4️⃣ Generating visualizations...")
+    output_path = Path(output_dir) / "plots"
+    output_path.mkdir(parents=True, exist_ok=True)
+    print(f"Charts will be saved in: {output_path}")
+
+    # 3.1 Keyword distribution chart
+    print("  - Generating keyword distribution chart...")
+    plt.figure(figsize=(15, 8))
+    plt.bar(stats['keyword_statistics'].keys(), stats['keyword_statistics'].values())
+    plt.xticks(rotation=45, ha='right')
+    plt.title(f'Keyword Distribution for {subset_name.capitalize()} Subset (Optimized)')
+    plt.xlabel('Keywords')
+    plt.ylabel('Match Count')
+    plt.savefig(output_path / f"keyword_distribution_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 3.2 Text length distribution
+    print("  - Generating text length distribution...")
+    plt.figure(figsize=(10, 6))
+    df['text_length'].hist(bins=50)
+    plt.title(f'Text Length Distribution ({subset_name.capitalize()} Subset - Optimized)')
+    plt.xlabel('Text Length')
+    plt.ylabel('Frequency')
+    plt.savefig(output_path / f"text_length_dist_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 3.3 Keyword co-occurrence analysis
+    print("  - Generating keyword co-occurrence heatmap...")
+    cooccurrence_matrix = np.zeros((len(keywords), len(keywords)))
+    for text in df['clean_text']:
+        present_keywords = [k for k in keywords if k.lower() in text.lower()]
+        for i, k1 in enumerate(present_keywords):
+            for j, k2 in enumerate(present_keywords):
+                if i != j:
+                    cooccurrence_matrix[keywords.index(k1)][keywords.index(k2)] += 1
+
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(cooccurrence_matrix,
+                xticklabels=keywords,
+                yticklabels=keywords,
+                cmap='YlOrRd')
+    plt.title(f'Keyword Co-occurrence Heatmap ({subset_name.capitalize()} Subset - Optimized)')
+    plt.xticks(rotation=45, ha='right')
+    plt.tight_layout()
+    plt.savefig(output_path / f"keyword_cooccurrence_{subset_name}_subset_opt.png", bbox_inches='tight')
+    plt.close()
+
+    # 4. Save statistics
+    print("\n5️⃣ Saving statistics...")
+    stats_path = Path(output_dir) / "stats"
+    stats_path.mkdir(parents=True, exist_ok=True)
+    stats_file = stats_path / f"analysis_stats_{subset_name}_subset_opt.json"
+
+    with open(stats_file, 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+    print(f"Statistics saved to: {stats_file}")
+
+    print(f"\n✅ Analysis complete! All results saved to {output_dir} directory")
+
+if __name__ == "__main__":
+    # Set file paths for optimized version
+    emergency_subset = "../dataset/emergency/emergency_subset_opt.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    output_dir = "../analysis"
+
+    # Run analysis
+    analyze_subset(emergency_subset, emergency_keywords, output_dir, "emergency")
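A small caveat on the per-keyword counts in this script: pandas' str.contains treats the keyword as a regular expression by default, so a keyword containing characters such as "(" or "+" would be misread, and missing text is not handled (the treatment explorer below passes na=False). A literal, NaN-safe variant would look like this (sketch, not part of the commit):

    import pandas as pd

    # Literal, NaN-safe variant of the per-keyword check.
    df = pd.DataFrame({"clean_text": ["CT and x-ray ordered", None]})
    keyword = "x-ray"
    cnt = df["clean_text"].str.contains(keyword, case=False, regex=False, na=False).sum()
    print(cnt)  # 1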
dataset/scripts/data_explorer_treatment_opt.py
@@ -0,0 +1,263 @@
+# /scripts/data_explorer_treatment_opt.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+import numpy as np
+from tqdm import tqdm
+import re
+
+def calculate_density(matches, text_length):
+    """
+    Calculate keyword density per 1000 words
+
+    Args:
+        matches: Number of keyword matches
+        text_length: Total text length
+
+    Returns:
+        float: Density per 1000 words
+    """
+    return (matches / text_length) * 1000
+
+def analyze_treatment_subset(
+    treatment_file_path,
+    emergency_keywords_path,
+    treatment_keywords_path,
+    output_dir="analysis_treatment_opt"  # Updated default output directory
+):
+    """
+    Specialized analysis for optimized treatment subset focusing on:
+    1. Dual keyword analysis (emergency + treatment)
+    2. Path B effectiveness validation
+    3. Condition mapping data preparation
+    4. RAG readiness assessment
+    """
+    print(f"\n{'='*60}")
+    print(f"Treatment Subset Analysis (Optimized Version)")
+    print(f"Treatment file: {treatment_file_path}")
+    print(f"Emergency keywords: {emergency_keywords_path}")
+    print(f"Treatment keywords: {treatment_keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*60}\n")
+
+    # Load data
+    print("1️⃣ Loading optimized treatment subset data...")
+    df = pd.read_csv(treatment_file_path)
+    output_dir = Path(output_dir)
+
+    # Load keyword lists
+    print("2️⃣ Loading keyword lists...")
+    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
+        emergency_keywords = [line.strip() for line in f if line.strip()]
+
+    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
+        treatment_keywords = [line.strip() for line in f if line.strip()]
+
+    print(f"   Emergency keywords: {len(emergency_keywords)}")
+    print(f"   Treatment keywords: {len(treatment_keywords)}")
+
+    # Basic statistics
+    print("\n3️⃣ Computing basic statistics...")
+    total_records = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_length = df['text_length'].mean()
+
+    print(f"   Total treatment records: {total_records}")
+    print(f"   Average text length: {avg_length:.2f} characters")
+
+    # Initialize comprehensive statistics
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total_records),
+            'avg_text_length': float(avg_length),
+            'emergency_keywords_count': len(emergency_keywords),
+            'treatment_keywords_count': len(treatment_keywords)
+        },
+        'emergency_keyword_stats': {},
+        'treatment_keyword_stats': {},
+        'cooccurrence_analysis': {},
+        'path_b_validation': {},
+        'condition_mapping_candidates': {}
+    }
+
+    # Emergency keyword analysis in treatment subset
+    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
+    for keyword in emergency_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['emergency_keyword_stats'][keyword] = int(count)
+        print(f"   Emergency: {keyword} -> {count} records")
+
+    # Treatment keyword analysis
+    print("\n5️⃣ Analyzing treatment keywords...")
+    for keyword in treatment_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['treatment_keyword_stats'][keyword] = int(count)
+        print(f"   Treatment: {keyword} -> {count} records")
+
+    # Step 6: Co-occurrence analysis
+    print("\n6️⃣ Computing keyword co-occurrence patterns...")
+
+    # Initialize matrices for full dataset
+    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
+    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
+
+    # Pre-process text
+    print("   Pre-processing text...")
+    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+
+    # Process all emergency keywords
+    print("\n   Processing all emergency keywords...")
+    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
+        emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
+        matches = emergency_matrix[:, i].sum()
+        print(f"   - {keyword}: {matches} matches")
+
+    # Process all treatment keywords
+    print("\n   Processing all treatment keywords...")
+    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
+        pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
+        treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
+        matches = treatment_matrix[:, i].sum()
+        print(f"   - {keyword}: {matches} matches")
+
+    # Compute co-occurrence matrix
+    print("\n   Computing co-occurrence matrix...")
+    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
+    print("   Computation completed successfully")
+
+    # Extract results
+    print("   Extracting co-occurrence pairs...")
+    cooccurrence_pairs = []
+    for i, em_kw in enumerate(emergency_keywords):
+        for j, tr_kw in enumerate(treatment_keywords):
+            count = int(cooc_matrix[i, j])
+            if count > 0:
+                cooccurrence_pairs.append({
+                    'emergency_keyword': em_kw,
+                    'treatment_keyword': tr_kw,
+                    'cooccurrence_count': count,
+                    'percentage': float(count / len(df) * 100)
+                })
+
+    # Sort and store results
+    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
+    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs
+
+    print(f"   Found {len(cooccurrence_pairs)} co-occurrence pairs")
+    print("   Top 5 co-occurrence pairs:")
+    for i, pair in enumerate(cooccurrence_pairs[:5]):
+        print(f"   {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
+
+    # Step 7: Path B validation metrics
+    print("\n7️⃣ Validating Path B strategy effectiveness...")
+
+    # Compute keyword density with progress bar
+    print("   Computing keyword density...")
+    with tqdm(total=2, desc="Density calculation") as pbar:
+        emergency_density = calculate_density(
+            emergency_matrix.sum(axis=1),
+            df['text_length']
+        )
+        pbar.update(1)
+
+        treatment_density = calculate_density(
+            treatment_matrix.sum(axis=1),
+            df['text_length']
+        )
+        pbar.update(1)
+
+    # Store density in dataframe for visualization
+    df['emergency_keyword_density'] = emergency_density
+    df['treatment_keyword_density'] = treatment_density
+
+    # Calculate statistics with the new density metrics
+    stats['path_b_validation'] = {
+        'avg_emergency_density': float(np.mean(emergency_density)),
+        'avg_treatment_density': float(np.mean(treatment_density)),
+        'high_density_records': int(sum(
+            (emergency_density >= np.percentile(emergency_density, 75)) &
+            (treatment_density >= np.percentile(treatment_density, 75))
+        )),
+        'precision_estimate': float(sum(
+            (emergency_density > 0) & (treatment_density > 0)
+        ) / len(df))
+    }
+
+    # Print detailed results
+    print("\n   Results:")
+    print(f"   - Average emergency keyword density (per 1000 words): {stats['path_b_validation']['avg_emergency_density']:.2f}")
+    print(f"   - Average treatment keyword density (per 1000 words): {stats['path_b_validation']['avg_treatment_density']:.2f}")
+    print(f"   - High-density records (top 25% in both): {stats['path_b_validation']['high_density_records']}")
+    print(f"   - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
+
+    # Sample distribution analysis
+    print("\n   Density Distribution:")
+    density_counts = pd.DataFrame({
+        'emergency': pd.qcut(emergency_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High']),
+        'treatment': pd.qcut(treatment_density, q=4, labels=['Low', 'Medium-Low', 'Medium-High', 'High'])
+    }).value_counts().head()
+    print("   Top 5 density combinations (emergency, treatment):")
+    for (em, tr), count in density_counts.items():
+        print(f"   - {count} documents have {em} emergency and {tr} treatment density")
+
+    # Visualization
+    print("\n8️⃣ Generating visualizations...")
+    output_plots = output_dir / "plots"
+    output_plots.mkdir(parents=True, exist_ok=True)
+
+    # 1. Keyword density scatter plot with improved visualization
+    plt.figure(figsize=(12, 8))
+    plt.scatter(
+        emergency_density,
+        treatment_density,
+        alpha=0.6,
+        c=np.log1p(df['text_length']),
+        cmap='viridis'
+    )
+    plt.colorbar(label='Log Text Length')
+    plt.xlabel('Emergency Keyword Density (per 1000 words)')
+    plt.ylabel('Treatment Keyword Density (per 1000 words)')
+    plt.title('Emergency vs Treatment Keyword Density (Optimized)')
+    plt.grid(True, alpha=0.3)
+
+    # Add mean lines
+    plt.axvline(x=np.mean(emergency_density), color='r', linestyle='--', alpha=0.5, label='Mean Emergency Density')
+    plt.axhline(y=np.mean(treatment_density), color='g', linestyle='--', alpha=0.5, label='Mean Treatment Density')
+    plt.legend()
+
+    plt.savefig(output_plots / "keyword_density_scatter_opt.png", bbox_inches='tight', dpi=300)
+    plt.close()
+
+    # Save comprehensive statistics
+    print("\n9️⃣ Saving analysis results...")
+    stats_dir = output_dir / "stats"
+    stats_dir.mkdir(parents=True, exist_ok=True)
+
+    with open(stats_dir / "treatment_analysis_comprehensive_opt.json", 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+
+    print(f"✅ Treatment subset analysis complete! (Optimized Version)")
+    print(f"   Results saved to: {output_dir}")
+    print(f"   Plots: {output_plots}")
+    print(f"   Statistics: {stats_dir}")
+
+    return stats
+
+if __name__ == "__main__":
+    # Configuration for optimized version
+    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset_opt.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    treatment_keywords = "../keywords/treatment_keywords.txt"
+    output_directory = "../analysis_treatment_opt"
+
+    # Run analysis
+    results = analyze_treatment_subset(
+        treatment_file,
+        emergency_keywords,
+        treatment_keywords,
+        output_directory
+    )
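The co-occurrence step above reduces to a single matrix product: with one boolean row per document and one column per keyword, E.T @ T counts, for every (emergency, treatment) pair, the documents containing both. A tiny worked example (not part of the commit):

    import numpy as np

    # 3 documents x 2 emergency keywords and 3 documents x 2 treatment keywords.
    E = np.array([[1, 0],
                  [1, 1],
                  [0, 1]], dtype=bool)  # per document: contains "mi"?, contains "shock"?
    T = np.array([[1, 1],
                  [1, 0],
                  [0, 1]], dtype=bool)  # per document: contains "iv"?, contains "cpr"?
    cooc = E.astype(int).T @ T.astype(int)
    print(cooc)  # [[2 1]
                 #  [1 1]]  e.g. "mi" and "iv" co-occur in 2 documents

Note also that calculate_density divides by df['text_length'], which is a character count (str.len()), so the reported "per 1000 words" densities are effectively per 1000 characters.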