YanBoChen commited on
Commit
9829a46
·
1 Parent(s): 654aa66

feat: implement special-term emergency keyword matching and metadata extraction

Browse files
dataset/scripts/01_filter_emergency_opt.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+
5
+ # Medical term processor class for handling special terms
6
+ class MedicalTermProcessor:
7
+ def __init__(self):
8
+ # Emergency special terms mapping
9
+ self.emergency_special_terms = {
10
+ # Cardiac
11
+ 'mi': ['mi', 'm.i.', 'myocardial infarction', 'MI'],
12
+ 'acs': ['acs', 'ACS', 'acute coronary syndrome'],
13
+
14
+ # Respiratory
15
+ 'ards': ['ards', 'ARDS', 'acute respiratory distress syndrome'],
16
+ 'respiratory_failure': ['respiratory failure', 'resp failure', 'RF'],
17
+
18
+ # Neurological
19
+ 'loc': ['loc', 'LOC', 'loss of consciousness'],
20
+ 'cva': ['cva', 'CVA', 'stroke', 'cerebrovascular accident'],
21
+
22
+ # Shock States
23
+ 'shock': ['shock', 'circulatory failure'],
24
+ 'septic_shock': ['septic shock', 'sepsis induced shock'],
25
+
26
+ # Bleeding
27
+ 'gi_bleed': ['gi bleed', 'gi bleeding', 'gastrointestinal hemorrhage', 'GI hemorrhage'],
28
+ 'hemorrhage': ['hemorrhage', 'bleeding', 'blood loss'],
29
+
30
+ # Vital Signs
31
+ 'hypotension': ['hypotension', 'low bp', 'low blood pressure'],
32
+ 'tachycardia': ['tachycardia', 'elevated heart rate', 'fast heart rate']
33
+ }
34
+
35
+ def get_all_variants(self):
36
+ """Get all term variants including special terms"""
37
+ variants = []
38
+ for term_list in self.emergency_special_terms.values():
39
+ variants.extend(term_list)
40
+ return variants
41
+
42
+ # Function: Load keywords and print progress
43
+ def load_keywords(path, processor):
44
+ print(f"📥 Loading keywords from: {path}")
45
+ # Load basic keywords
46
+ with open(path, "r", encoding="utf-8") as f:
47
+ basic_kws = [line.strip() for line in f if line.strip()]
48
+
49
+ # Add special term variants
50
+ special_kws = processor.get_all_variants()
51
+ all_kws = list(set(basic_kws + special_kws)) # Remove duplicates
52
+
53
+ print(f" Loaded {len(all_kws)} keywords (including variants)")
54
+ return all_kws
55
+
56
+ # Step 1: Read source data
57
+ print("1️⃣ Reading source data...")
58
+ source_path = "../dataset/guidelines_source_filtered.jsonl"
59
+ df = pd.read_json(source_path, lines=True)
60
+ print(f" Loaded {len(df)} records")
61
+
62
+ # Step 2: Load emergency keywords and match
63
+ print("2️⃣ Loading emergency keywords and matching...")
64
+ processor = MedicalTermProcessor()
65
+ keywords = load_keywords("../keywords/emergency_keywords.txt", processor)
66
+ pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
67
+
68
+ # Match keywords and add metadata columns
69
+ df["matched"] = (
70
+ df["clean_text"]
71
+ .fillna("") # Convert NaN to empty string
72
+ .str.findall(pattern, flags=re.IGNORECASE)
73
+ .apply(lambda lst: "|".join(lst) if lst else "")
74
+ )
75
+ df["has_emergency"] = df["matched"].str.len() > 0
76
+
77
+ # Add metadata columns for future use
78
+ df["type"] = "emergency" # Document type identifier
79
+ df["condition"] = "" # Reserved for future condition mapping
80
+
81
+ # Calculate average matches
82
+ cnt_em = df["has_emergency"].sum()
83
+ avg_matches = (
84
+ df[df["has_emergency"]]["matched"]
85
+ .str.count(r"\|") # Escape the pipe
86
+ .add(1)
87
+ .mean()
88
+ )
89
+
90
+ print(f" Matched {cnt_em} emergency-related records")
91
+ print(f" Average keywords per record: {avg_matches:.2f}")
92
+
93
+ # Step 3: Save emergency subset
94
+ print("3️⃣ Saving emergency subset...")
95
+ out_dir = "../dataset/emergency"
96
+ os.makedirs(out_dir, exist_ok=True)
97
+ subset = df[df["has_emergency"]]
98
+
99
+ # Save with _opt suffix to distinguish from original files
100
+ subset.to_json(f"{out_dir}/emergency_subset_opt.jsonl", orient="records", lines=True)
101
+ subset.to_csv(f"{out_dir}/emergency_subset_opt.csv", index=False)
102
+ print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}` with _opt suffix")