YanBoChen committed
Commit d37f4b2 · 1 Parent(s): 2ee61dc

WIP: while exploring the treatment subset during dataset pre-processing, some abnormalities appeared in keyword matching; add test scripts to identify the problem

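The anomaly under investigation is easy to reproduce: with `\b` word boundaries, a hyphen counts as a word break, so a partial word like "Non-emergency" still matches `emergency`, while a fused word like "Subacute" never matches `acute`. A minimal sketch of the behavior the new tests exercise:

```python
import re

# Same pattern shape as in 02_filter_treatment.py: escaped alternation with \b.
keywords = ["emergency", "acute", "urgent"]
pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)

print(pattern.findall("Non-emergency situation"))  # ['emergency'] -- '-' is a word break
print(pattern.findall("Subacute condition"))       # [] -- no boundary inside a fused word
print(pattern.findall("Emergency-related"))        # ['Emergency']
```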
dataset/analysis/keyword_matching_test_results.json ADDED
@@ -0,0 +1,151 @@
+ {
+   "special_terms_matching": [
+     {
+       "clean_text": "Patient needs an x-ray of the chest",
+       "category": "x-ray variants",
+       "matched": "x-ray"
+     },
+     {
+       "clean_text": "Ordered chest xray",
+       "category": "x-ray variants",
+       "matched": "xray"
+     },
+     {
+       "clean_text": "X ray shows pneumonia",
+       "category": "x-ray variants",
+       "matched": "X ray"
+     },
+     {
+       "clean_text": "XRAY negative",
+       "category": "x-ray variants",
+       "matched": "XRAY"
+     },
+     {
+       "clean_text": "CT scan reveals nodule",
+       "category": "ct-scan variants",
+       "matched": "CT scan"
+     },
+     {
+       "clean_text": "CT-scan indicates mass",
+       "category": "ct-scan variants",
+       "matched": "CT-scan"
+     },
+     {
+       "clean_text": "Requires ctscan urgently",
+       "category": "ct-scan variants",
+       "matched": "ctscan"
+     },
+     {
+       "clean_text": "CTSCAN of abdomen",
+       "category": "ct-scan variants",
+       "matched": "CTSCAN"
+     },
+     {
+       "clean_text": "Point-of-care testing needed",
+       "category": "point-of-care variants",
+       "matched": "Point-of-care"
+     },
+     {
+       "clean_text": "Point of care ultrasound",
+       "category": "point-of-care variants",
+       "matched": "Point of care"
+     },
+     {
+       "clean_text": "POC testing results",
+       "category": "point-of-care variants",
+       "matched": ""
+     },
+     {
+       "clean_text": "Ordered both x-ray and CT scan",
+       "category": "mixed cases",
+       "matched": "x-ray|CT scan"
+     },
+     {
+       "clean_text": "XRAY and CTSCAN negative",
+       "category": "mixed cases",
+       "matched": "XRAY|CTSCAN"
+     },
+     {
+       "clean_text": "Multiple point-of-care tests with x-ray",
+       "category": "mixed cases",
+       "matched": "point-of-care|x-ray"
+     },
+     {
+       "clean_text": "No imaging mentioned",
+       "category": "negative cases",
+       "matched": ""
+     },
+     {
+       "clean_text": "Regular examination only",
+       "category": "negative cases",
+       "matched": ""
+     },
+     {
+       "clean_text": "Laboratory tests pending",
+       "category": "negative cases",
+       "matched": ""
+     }
+   ],
+   "basic_matching": [
+     {
+       "clean_text": "Emergency treatment required",
+       "category": "simple matches",
+       "matched": "Emergency"
+     },
+     {
+       "clean_text": "Acute condition observed",
+       "category": "simple matches",
+       "matched": "Acute"
+     },
+     {
+       "clean_text": "Urgent care needed",
+       "category": "simple matches",
+       "matched": "Urgent"
+     },
+     {
+       "clean_text": "EMERGENCY situation",
+       "category": "case variations",
+       "matched": "EMERGENCY"
+     },
+     {
+       "clean_text": "Acute RESPIRATORY failure",
+       "category": "case variations",
+       "matched": "Acute"
+     },
+     {
+       "clean_text": "URgent surgical intervention",
+       "category": "case variations",
+       "matched": "URgent"
+     },
+     {
+       "clean_text": "Emergency treatment for acute condition",
+       "category": "multiple matches",
+       "matched": "Emergency|acute"
+     },
+     {
+       "clean_text": "Urgent care in emergency department",
+       "category": "multiple matches",
+       "matched": "Urgent|emergency"
+     },
+     {
+       "clean_text": "Acute respiratory emergency",
+       "category": "multiple matches",
+       "matched": "Acute|emergency"
+     },
+     {
+       "clean_text": "Non-emergency situation",
+       "category": "partial words",
+       "matched": "emergency"
+     },
+     {
+       "clean_text": "Subacute condition",
+       "category": "partial words",
+       "matched": ""
+     },
+     {
+       "clean_text": "Emergency-related",
+       "category": "partial words",
+       "matched": "Emergency"
+     }
+   ]
+ }
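For quick triage of these results, a small sketch (assuming the repository layout above) that flags every test case with an empty `matched` field:

```python
import json

# Sketch: list the test cases that matched nothing, per suite.
with open("dataset/analysis/keyword_matching_test_results.json", encoding="utf-8") as f:
    results = json.load(f)

for suite, rows in results.items():
    for row in rows:
        if not row["matched"]:
            print(f"{suite} / {row['category']}: {row['clean_text']}")
# Misses include "POC testing results" (no 'poc' variant), "Subacute condition",
# and the intentional negative cases.
```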
dataset/keywords/treatment_keywords.txt CHANGED
@@ -1,118 +1,105 @@
- iv fluids
- Infusion Intravenous
- fluid resuscitation
- Intravenous Infusion
- normal saline
- crystalloids
- vasopressors
- Vasoconstrictor Agents
- Epinephrine
- Ondansetron
- Ibuprofen
- Morphine
- Lidocaine
+ ACLS
+ administer
+ Adrenaline
+ Advanced Cardiac Life Support
  Airway Management
- intubation
- Intratracheal Intubation
- ventilation support
- Ventilators
- oxygen therapy
- Oxygen Inhalation Therapy
- cpap
- Continuous Positive Airway Pressure
- bipap
+ alpha blocker
+ Amiodarone
+ analgesia
+ Anesthesia Procedural
+ Anti-Bacterial Agents
+ antibiotic
+ arterial line placement
+ beta blocker
  Bi-level Positive Airway Pressure
- Nebulization
- cpr
+ bipap
+ Blood Transfusion
+ Bosmin
  Cardiopulmonary Resuscitation
- ACLS
- Advanced Cardiac Life Support
- Defibrillation
  Cardioversion
- Blood Transfusion
- transfusion
- hemodynamic monitoring
- Hemodynamics
- central line placement
- Catheterization Central Venous
- arterial line placement
  Catheterization Arterial
- Hemostasis
- wound care
- Wound Management
- Suturing
- Suture
- Tourniquet
+ Catheterization Central Venous
+ central line placement
  compression dressing
- Wound Dressing
- splinting
- Splints
- radiologic imaging
- Radiography
- point of care ultrasound
- POCUS
- Ultrasonography Point of Care
- ultrasound
- x-ray
- Radiography
+ Computed Tomography
+ cpap
+ cpr
+ crystalloids
  ct scan
- Tomography X-Ray Computed
- laboratory testing
- Laboratory Techniques
- Sedation
- analgesia
- Analgesia
- procedural sedation
- Anesthesia Procedural
- ketamine
- Ketamine
- midazolam
- Midazolam
- supportive care
- Supportive Care
- monitoring
- Patient Monitoring
- vital signs monitoring
- Vital Signs
+ Defibrillation
+ Dopamine
+ Dosage Forms
+ dose
+ Drug Administration Routes
+ Drug Therapy
+ Epinephrine
+ fluid
+ fluid resuscitation
+ hemodynamic monitoring
+ Hemodynamics
+ Hemostasis
+ Ibuprofen
  icu transfer
- Intensive Care Units
- treatment
- Therapeutics
+ Insulin
+ intervention
+ intubation
+ Intratracheal Intubation
+ Intravenous Infusion
+ iv fluids
+ laboratory techniques
+ laboratory testing
+ levophed
+ Lidocaine
  manage
- Patient Management
  management
- Patient Management
- intervention
- Therapeutic Intervention
- Therapy
  medication
- Drug Therapy
- procedure
- Surgical Procedures Operative
- resuscitation
- Cardiopulmonary Resuscitation
- administer
- Drug Administration Routes
- dose
- Dosage Forms
+ midazolam
  monitor
- Patient Monitoring
- Oxygen
- fluid
- Infusion Intravenous
- surgery
- Surgical Procedures
- antibiotic
- Anti-Bacterial Agents
- Dopamine
- Amiodarone
- levophed
- Norepinephrine
- Epinephrine
- Bosmin
- Adrenaline
- Insulin
+ monitoring
+ Morphine
+ Nebulization
  nitroglycerin
  NTG
- beta blocker
- alpha blocker
+ Norepinephrine
+ normal saline
+ Ondansetron
+ Oxygen
+ Oxygen Inhalation Therapy
+ oxygen therapy
+ Patient Management
+ Patient Monitoring
+ POCUS
+ point of care ultrasound
+ procedural sedation
+ procedure
+ radiologic imaging
+ Radiography
+ resuscitation
+ Sedation
+ splinting
+ Splints
+ supportive care
+ surgical procedures
+ Surgical Procedures Operative
+ surgery
+ Suture
+ Suturing
+ Therapeutic Intervention
+ Therapeutics
+ Therapy
+ tourniquet
+ transfusion
+ treat
+ treatment
+ Ultrasonography Point of Care
+ ultrasound
+ Vasoconstrictor Agents
+ vasopressors
+ ventilation support
+ Ventilators
+ Vital Signs
+ vital signs monitoring
+ wound care
+ Wound Dressing
+ Wound Management
+ X-Ray
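The new file appears to be the old keyword list de-duplicated case-insensitively and sorted alphabetically, plus a few manual edits (e.g. `treat` and `Computed Tomography` added, `Tourniquet` re-cased). A sketch of that normalization, assuming it is run from the repository root; the path is taken from this diff:

```python
from pathlib import Path

# Sketch: case-insensitive dedup + sort, keeping the first spelling seen.
path = Path("dataset/keywords/treatment_keywords.txt")
keywords = [ln.strip() for ln in path.read_text(encoding="utf-8").splitlines() if ln.strip()]

seen, unique = set(), []
for kw in keywords:
    if kw.lower() not in seen:      # "analgesia"/"Analgesia" collapse to one entry
        seen.add(kw.lower())
        unique.append(kw)

unique.sort(key=str.lower)          # case-insensitive alphabetical order
path.write_text("\n".join(unique) + "\n", encoding="utf-8")
```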
dataset/scripts/02_filter_treatment.py CHANGED
@@ -4,31 +4,60 @@ import os
  import re
  import pandas as pd
 
- # Function: Load keywords and print progress
- def load_keywords(path):
-     print(f"📥 Loading keywords from: {path}")
-     with open(path, "r", encoding="utf-8") as f:
-         kws = [line.strip() for line in f if line.strip()]
-     print(f"   Loaded {len(kws)} keywords")
-     return kws
-
- # Step 1: Load emergency subset
+ def preprocess_keywords(keywords_file):
+     """Load and preprocess treatment keywords"""
+     print(f"📥 Loading keywords from: {keywords_file}")
+
+     # Special medical terms with common variants
+     special_terms = {
+         'x-ray': ['x-ray', 'x ray', 'xray'],
+         'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+         'point-of-care': ['point-of-care', 'point of care']
+     }
+
+     # Read and preprocess keywords
+     with open(keywords_file, "r", encoding="utf-8") as f:
+         keywords = [line.strip().lower() for line in f if line.strip()]
+
+     # Process keywords and handle special terms
+     processed_keywords = []
+     for kw in keywords:
+         if kw in special_terms:
+             processed_keywords.extend(special_terms[kw])
+         else:
+             processed_keywords.append(kw)
+
+     print(f"   Loaded {len(keywords)} base keywords")
+     print(f"   Processed into {len(processed_keywords)} keyword variants")
+     return processed_keywords
+
+ def create_regex_pattern(keywords):
+     """Create compiled regex pattern with word boundaries"""
+     pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+     return re.compile(pattern, re.IGNORECASE)
+
+ # Step 1: Read source data
  print("1️⃣ Reading emergency subset...")
  emergency_path = "../dataset/emergency/emergency_subset.jsonl"
  df = pd.read_json(emergency_path, lines=True)
  print(f"   Loaded {len(df)} emergency records")
+ print(f"   Contains emergency keywords in 'matched' column")
+
+ # Step 2: Load treatment keywords and match
+ print("2️⃣ Loading treatment keywords and matching...")
+ treatment_keywords = preprocess_keywords("../keywords/treatment_keywords.txt")
+ pattern = create_regex_pattern(treatment_keywords)
 
- # Step 2: Load and apply treatment keywords
- print("2️⃣ Loading treatment keywords and filtering...")
- treatment_keywords = load_keywords("../keywords/treatment_keywords.txt")
- pattern = r"\b(?:" + "|".join(treatment_keywords) + r")\b"
+ # Step 3: Process text and match keywords
+ print("3️⃣ Processing text and matching keywords...")
+ # Create lowercase version of text for matching
+ df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
 
- # Match treatment keywords and add metadata
+ # Match treatment keywords and add metadata columns
+ # Note: Preserving original 'matched' column from emergency subset
  df["treatment_matched"] = (
-     df["clean_text"]
-     .fillna("")
-     .str.findall(pattern, flags=re.IGNORECASE)
-     .apply(lambda lst: "|".join(lst) if lst else "")
+     df["clean_text_lower"]
+     .apply(lambda text: "|".join(pattern.findall(text)) or "")
  )
  df["has_treatment"] = df["treatment_matched"].str.len() > 0
 
@@ -36,14 +65,39 @@ df["has_treatment"] = df["treatment_matched"].str.len() > 0
  df["type"] = "treatment"      # Document type identifier
  df["condition"] = ""          # Reserved for future condition mapping
 
+ # Verify columns
+ print("   Verifying columns...")
+ print(f"   - Emergency keywords column (matched): {df['matched'].notna().sum()} records")
+ print(f"   - Treatment keywords column (treatment_matched): {df['treatment_matched'].notna().sum()} records")
+
+ # Calculate statistics
  cnt_treat = df["has_treatment"].sum()
- print(f"   Matched {cnt_treat} records with treatment information")
+ avg_matches = (
+     df[df["has_treatment"]]["treatment_matched"]
+     .str.count(r"\|")
+     .add(1)
+     .mean()
+ )
 
- # Step 3: Save treatment subset
- print("3️⃣ Saving treatment subset...")
+ print(f"   Found {cnt_treat} treatment-related records")
+ print(f"   Average treatment keywords per record: {avg_matches:.2f}")
+
+ # Step 4: Save treatment subset
+ print("4️⃣ Saving treatment subset...")
  out_dir = "../dataset/emergency_treatment"
  os.makedirs(out_dir, exist_ok=True)
- subset = df[df["has_treatment"]]
+
+ # Select records with treatment keywords
+ subset = df[df["has_treatment"]].copy()  # Use copy to avoid SettingWithCopyWarning
+
+ # Verify final subset columns
+ print("   Final subset columns:")
+ print(f"   - Emergency keywords (matched): {subset['matched'].notna().sum()} records")
+ print(f"   - Treatment keywords (treatment_matched): {subset['treatment_matched'].notna().sum()} records")
+
  subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
  subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
- print(f"✅ Complete! Generated treatment subset with {len(subset)} records, saved in `{out_dir}`")
+
+ print(f"✅ Generated treatment subset with {len(subset)} records")
+ print(f"   Saved in: {out_dir}")
+ print(f"   Contains both emergency and treatment keywords")
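The matching step can be sanity-checked in isolation before running it over the full emergency subset. A minimal sketch on a toy frame, using the same logic as the script (the keyword list here is illustrative, not the real treatment list):

```python
import re
import pandas as pd

# Toy keywords standing in for the processed treatment variants.
keywords = ["x-ray", "x ray", "xray", "ct scan", "intubation"]
pattern = re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b", re.IGNORECASE)

df = pd.DataFrame({"clean_text": ["Chest x-ray ordered", "CT scan then intubation", None]})
df["clean_text_lower"] = df["clean_text"].fillna("").str.lower()
df["treatment_matched"] = df["clean_text_lower"].apply(lambda t: "|".join(pattern.findall(t)))
df["has_treatment"] = df["treatment_matched"].str.len() > 0

print(df[["treatment_matched", "has_treatment"]])
# row 0: "x-ray"              / True
# row 1: "ct scan|intubation" / True
# row 2: ""                   / False  (null text handled by fillna)
```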
dataset/scripts/check_subset_integrity.py ADDED
@@ -0,0 +1,178 @@
+ #!/usr/bin/env python3
+ # /scripts/check_subset_integrity.py
+ 
+ import pandas as pd
+ import json
+ from pathlib import Path
+ from tqdm import tqdm
+ 
+ def check_subset_sample(file_path, sample_size=100):
+     """
+     Check the first N rows of the subset file
+     """
+     print(f"\n{'='*60}")
+     print(f"📊 Sampling Analysis (first {sample_size} rows)")
+     print(f"{'='*60}")
+ 
+     # Read sample
+     print(f"\n1️⃣ Reading sample from: {file_path}")
+     sample_df = pd.read_csv(file_path, nrows=sample_size)
+ 
+     # Basic information
+     print("\n2️⃣ Basic Information:")
+     print(f"   Columns present: {', '.join(sample_df.columns.tolist())}")
+ 
+     # Check matched columns
+     print("\n3️⃣ Matched Columns Status:")
+     matched_stats = {
+         'matched': {
+             'non_null': int(sample_df['matched'].notna().sum()),
+             'non_empty': int((sample_df['matched'].str.len() > 0).sum()),
+             'unique_values': sample_df['matched'].nunique()
+         },
+         'treatment_matched': {
+             'non_null': int(sample_df['treatment_matched'].notna().sum()),
+             'non_empty': int((sample_df['treatment_matched'].str.len() > 0).sum()),
+             'unique_values': sample_df['treatment_matched'].nunique()
+         }
+     }
+ 
+     for col, stats in matched_stats.items():
+         print(f"\n   {col}:")
+         print(f"   - Non-null count: {stats['non_null']}/{sample_size}")
+         print(f"   - Non-empty count: {stats['non_empty']}/{sample_size}")
+         print(f"   - Unique values: {stats['unique_values']}")
+ 
+     # Sample rows with both matches
+     # Parentheses around each comparison are required: '&' binds tighter than '>'
+     print("\n4️⃣ Sample Rows with Both Matches:")
+     both_matched = sample_df[
+         (sample_df['matched'].notna() & (sample_df['matched'].str.len() > 0)) &
+         (sample_df['treatment_matched'].notna() & (sample_df['treatment_matched'].str.len() > 0))
+     ].head(3)
+ 
+     for idx, row in both_matched.iterrows():
+         print(f"\n   Row {idx}:")
+         print(f"   - Emergency keywords: {row['matched']}")
+         print(f"   - Treatment keywords: {row['treatment_matched']}")
+ 
+     return matched_stats
+ 
+ def analyze_large_file(file_path, chunk_size=1000):
+     """
+     Analyze the entire file in chunks
+     """
+     print(f"\n{'='*60}")
+     print(f"📈 Full File Analysis (chunk size: {chunk_size})")
+     print(f"{'='*60}")
+ 
+     stats = {
+         'total_rows': 0,
+         'matched_stats': {
+             'non_null': 0,
+             'non_empty': 0
+         },
+         'treatment_matched_stats': {
+             'non_null': 0,
+             'non_empty': 0
+         },
+         'both_matched': 0
+     }
+ 
+     print("\n1️⃣ Processing file in chunks...")
+     chunks = pd.read_csv(file_path, chunksize=chunk_size)
+ 
+     for chunk in tqdm(chunks, desc="Analyzing chunks"):
+         # Update total rows
+         stats['total_rows'] += len(chunk)
+ 
+         # Update matched stats
+         stats['matched_stats']['non_null'] += chunk['matched'].notna().sum()
+         stats['matched_stats']['non_empty'] += (chunk['matched'].str.len() > 0).sum()
+ 
+         # Update treatment_matched stats
+         stats['treatment_matched_stats']['non_null'] += chunk['treatment_matched'].notna().sum()
+         stats['treatment_matched_stats']['non_empty'] += (chunk['treatment_matched'].str.len() > 0).sum()
+ 
+         # Update both matched count (same precedence caveat as above)
+         stats['both_matched'] += (
+             (chunk['matched'].notna() & (chunk['matched'].str.len() > 0)) &
+             (chunk['treatment_matched'].notna() & (chunk['treatment_matched'].str.len() > 0))
+         ).sum()
+ 
+     return stats
+ 
+ def generate_report(sample_stats, full_stats, output_dir):
+     """
+     Generate and save analysis report
+     """
+     print(f"\n{'='*60}")
+     print(f"📝 Generating Report")
+     print(f"{'='*60}")
+ 
+     report = {
+         'sample_analysis': sample_stats,
+         'full_file_analysis': {
+             'total_records': int(full_stats['total_rows']),
+             'matched_column': {
+                 'non_null_count': int(full_stats['matched_stats']['non_null']),
+                 'non_empty_count': int(full_stats['matched_stats']['non_empty']),
+                 'null_percentage': float(
+                     (full_stats['total_rows'] - full_stats['matched_stats']['non_null'])
+                     / full_stats['total_rows'] * 100
+                 )
+             },
+             'treatment_matched_column': {
+                 'non_null_count': int(full_stats['treatment_matched_stats']['non_null']),
+                 'non_empty_count': int(full_stats['treatment_matched_stats']['non_empty']),
+                 'null_percentage': float(
+                     (full_stats['total_rows'] - full_stats['treatment_matched_stats']['non_null'])
+                     / full_stats['total_rows'] * 100
+                 )
+             },
+             'both_matched_count': int(full_stats['both_matched']),
+             'both_matched_percentage': float(
+                 full_stats['both_matched'] / full_stats['total_rows'] * 100
+             )
+         }
+     }
+ 
+     # Create output directory
+     output_dir = Path(output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+ 
+     # Save report
+     report_file = output_dir / 'integrity_check_report.json'
+     with open(report_file, 'w', encoding='utf-8') as f:
+         json.dump(report, f, indent=2, ensure_ascii=False)
+ 
+     print(f"\nReport saved to: {report_file}")
+ 
+     # Print summary
+     print("\n📊 Summary:")
+     print(f"Total records: {report['full_file_analysis']['total_records']}")
+     print(f"Records with both matches: {report['full_file_analysis']['both_matched_count']} "
+           f"({report['full_file_analysis']['both_matched_percentage']:.2f}%)")
+ 
+     return report
+ 
+ def main():
+     """
+     Main execution function
+     """
+     # Configuration
+     input_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
+     output_dir = "../analysis/integrity_check"
+ 
+     print(f"\n🔍 Starting Subset Integrity Check")
+     print(f"Input file: {input_file}")
+     print(f"Output directory: {output_dir}")
+ 
+     # Run analysis
+     sample_stats = check_subset_sample(input_file)
+     full_stats = analyze_large_file(input_file)
+     report = generate_report(sample_stats, full_stats, output_dir)
+ 
+     print("\n✅ Integrity check complete!")
+ 
+ if __name__ == "__main__":
+     main()
dataset/scripts/keyword_Match_Clean_for_subset_filter.txt ADDED
@@ -0,0 +1,85 @@
+ # Keyword Matching and Text Cleaning Logic for Subset Filtering
+ 
+ ## 1. Keyword Preprocessing
+ ```python
+ def preprocess_keywords(keywords_file):
+     # Handle special medical term variants
+     special_terms = {
+         'x-ray': ['x-ray', 'x ray', 'xray'],
+         'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+         'point-of-care': ['point-of-care', 'point of care']
+     }
+ 
+     # Read and preprocess keywords
+     with open(keywords_file, "r", encoding="utf-8") as f:
+         keywords = [
+             line.strip()   # Remove whitespace
+                 .lower()   # Convert to lowercase
+             for line in f
+             if line.strip()
+         ]
+ 
+     # Process special term variants
+     processed_keywords = []
+     for kw in keywords:
+         if kw in special_terms:
+             processed_keywords.extend(special_terms[kw])
+         else:
+             processed_keywords.append(kw)
+ 
+     return processed_keywords
+ ```
+ 
+ ## 2. Regex Pattern Processing
+ ```python
+ def create_regex_pattern(keywords):
+     # Simple word boundary matching
+     pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
+     return re.compile(pattern, re.IGNORECASE)
+ ```
+ 
+ ### Regex Pattern Explanation:
+ - `\b`: Word boundary matching
+ - `(?:...)`: Non-capturing group
+ - `re.escape()`: Escape special characters
+ - `re.IGNORECASE`: Case-insensitive matching
+ 
+ ## 3. Text Preprocessing and Matching
+ ```python
+ # Create lowercase version of text
+ df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+ 
+ # Match keywords
+ df["treatment_matched"] = (
+     df["clean_text_lower"]
+     .apply(lambda text: "|".join(pattern.findall(text)) or "")
+ )
+ ```
+ 
+ ## 4. Processing Logic Details
+ 
+ ### 4.1 Special Term Handling Rationale
+ - Common variants in medical literature
+ - Maintain semantic consistency
+ - Improve matching accuracy
+ 
+ ### 4.2 Regex Matching Strategy
+ - Word boundary matching for complete terms
+ - Precompiled patterns for performance
+ - Case-insensitive matching for flexibility
+ 
+ ### 4.3 Text Preprocessing Steps
+ 1. Fill null values (fillna)
+ 2. Convert to lowercase (str.lower)
+ 3. Create dedicated lowercase column to avoid repeated conversions
+ 
+ ## 5. Output Format
+ - matched column: Pipe-separated matched keywords
+ - type column: Document type identifier ("emergency" or "treatment")
+ - condition column: Reserved for future condition mapping
+ 
+ ## 6. Important Considerations
+ 1. Regular maintenance required for special term variants
+ 2. Precompiled regex patterns for performance optimization
+ 3. Dedicated text preprocessing storage to avoid redundant computations
+ 4. Maintain consistent column structure between emergency and treatment subsets
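One gap the test results above expose: `POC` is not among the point-of-care variants, so "POC testing results" matches nothing. If the abbreviation should count, a hypothetical extension of the variant table (not in the committed code) would be:

```python
# Hypothetical extension -- 'poc' is NOT in the committed special_terms table.
special_terms = {
    'x-ray': ['x-ray', 'x ray', 'xray'],
    'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
    # 'poc' added here; note the abbreviation may over-match unrelated senses.
    'point-of-care': ['point-of-care', 'point of care', 'poc'],
}
```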
dataset/scripts/test_keyword_matching.py ADDED
@@ -0,0 +1,175 @@
+ import pandas as pd
+ import re
+ from pathlib import Path
+ import json
+ 
+ def test_special_terms_matching():
+     """
+     Test special medical term matching logic
+     """
+     # Test cases for different scenarios
+     test_cases = {
+         "x-ray variants": [
+             "Patient needs an x-ray of the chest",
+             "Ordered chest xray",
+             "X ray shows pneumonia",
+             "XRAY negative"
+         ],
+         "ct-scan variants": [
+             "CT scan reveals nodule",
+             "CT-scan indicates mass",
+             "Requires ctscan urgently",
+             "CTSCAN of abdomen"
+         ],
+         "point-of-care variants": [
+             "Point-of-care testing needed",
+             "Point of care ultrasound",
+             "POC testing results"
+         ],
+         "mixed cases": [
+             "Ordered both x-ray and CT scan",
+             "XRAY and CTSCAN negative",
+             "Multiple point-of-care tests with x-ray"
+         ],
+         "negative cases": [
+             "No imaging mentioned",
+             "Regular examination only",
+             "Laboratory tests pending"
+         ]
+     }
+ 
+     # Special terms dictionary (from keyword_Match_Clean_for_subset_filter.txt)
+     special_terms = {
+         'x-ray': ['x-ray', 'x ray', 'xray'],
+         'ct-scan': ['ct-scan', 'ct scan', 'ctscan'],
+         'point-of-care': ['point-of-care', 'point of care']
+     }
+ 
+     # Create test DataFrame
+     test_df = pd.DataFrame({
+         'clean_text': [text for cases in test_cases.values() for text in cases],
+         'category': [cat for cat, texts in test_cases.items() for _ in texts]
+     })
+ 
+     # Process keywords
+     processed_keywords = []
+     for term, variants in special_terms.items():
+         processed_keywords.extend(variants)
+ 
+     # Create regex pattern
+     pattern = r"\b(?:" + "|".join(map(re.escape, processed_keywords)) + r")\b"
+ 
+     # Apply matching logic
+     test_df['matched'] = (
+         test_df['clean_text']
+         .fillna("")
+         .str.findall(pattern, flags=re.IGNORECASE)
+         .apply(lambda lst: "|".join(lst) if lst else "")
+     )
+ 
+     return test_df
+ 
+ def test_basic_matching():
+     """
+     Test basic keyword matching functionality
+     """
+     # Basic test cases
+     test_cases = {
+         "simple matches": [
+             "Emergency treatment required",
+             "Acute condition observed",
+             "Urgent care needed"
+         ],
+         "case variations": [
+             "EMERGENCY situation",
+             "Acute RESPIRATORY failure",
+             "URgent surgical intervention"
+         ],
+         "multiple matches": [
+             "Emergency treatment for acute condition",
+             "Urgent care in emergency department",
+             "Acute respiratory emergency"
+         ],
+         "partial words": [
+             "Non-emergency situation",
+             "Subacute condition",
+             "Emergency-related"
+         ]
+     }
+ 
+     # Create test DataFrame
+     test_df = pd.DataFrame({
+         'clean_text': [text for cases in test_cases.values() for text in cases],
+         'category': [cat for cat, texts in test_cases.items() for _ in texts]
+     })
+ 
+     # Test keywords
+     test_keywords = ['emergency', 'acute', 'urgent']
+     pattern = r"\b(?:" + "|".join(map(re.escape, test_keywords)) + r")\b"
+ 
+     # Apply matching logic
+     test_df['matched'] = (
+         test_df['clean_text']
+         .fillna("")
+         .str.findall(pattern, flags=re.IGNORECASE)
+         .apply(lambda lst: "|".join(lst) if lst else "")
+     )
+ 
+     return test_df
+ 
+ def save_test_results(results_dict):
+     """
+     Save test results to JSON file
+     """
+     output_dir = Path("../analysis")
+     output_dir.mkdir(exist_ok=True)
+ 
+     output_file = output_dir / "keyword_matching_test_results.json"
+ 
+     # Convert DataFrame results to dictionary
+     for key, df in results_dict.items():
+         results_dict[key] = df.to_dict(orient='records')
+ 
+     with open(output_file, 'w') as f:
+         json.dump(results_dict, f, indent=2)
+ 
+     print(f"Results saved to: {output_file}")
+ 
+ def run_tests():
+     """
+     Run all tests and output results
+     """
+     print("🧪 Running keyword matching tests...")
+ 
+     # Run tests
+     special_terms_results = test_special_terms_matching()
+     basic_matching_results = test_basic_matching()
+ 
+     # Print results
+     print("\n📊 Special Terms Matching Results:")
+     for category in special_terms_results['category'].unique():
+         print(f"\n{category}:")
+         subset = special_terms_results[special_terms_results['category'] == category]
+         for _, row in subset.iterrows():
+             print(f"Text: {row['clean_text']}")
+             print(f"Matched: {row['matched'] or 'No matches'}")
+             print("-" * 50)
+ 
+     print("\n📊 Basic Matching Results:")
+     for category in basic_matching_results['category'].unique():
+         print(f"\n{category}:")
+         subset = basic_matching_results[basic_matching_results['category'] == category]
+         for _, row in subset.iterrows():
+             print(f"Text: {row['clean_text']}")
+             print(f"Matched: {row['matched'] or 'No matches'}")
+             print("-" * 50)
+ 
+     # Save results
+     results_dict = {
+         'special_terms_matching': special_terms_results,
+         'basic_matching': basic_matching_results
+     }
+     save_test_results(results_dict)
+ 
+ if __name__ == "__main__":
+     run_tests()