YanBoChen committed · Commit 9829a46 · 1 Parent(s): 654aa66
feat: implement special_term emergency keyword matching and metadata extraction
dataset/scripts/01_filter_emergency_opt.py
ADDED
@@ -0,0 +1,102 @@
```python
import os
import re
import pandas as pd

# Medical term processor class for handling special terms
class MedicalTermProcessor:
    def __init__(self):
        # Emergency special terms mapping
        self.emergency_special_terms = {
            # Cardiac
            'mi': ['mi', 'm.i.', 'myocardial infarction', 'MI'],
            'acs': ['acs', 'ACS', 'acute coronary syndrome'],

            # Respiratory
            'ards': ['ards', 'ARDS', 'acute respiratory distress syndrome'],
            'respiratory_failure': ['respiratory failure', 'resp failure', 'RF'],

            # Neurological
            'loc': ['loc', 'LOC', 'loss of consciousness'],
            'cva': ['cva', 'CVA', 'stroke', 'cerebrovascular accident'],

            # Shock States
            'shock': ['shock', 'circulatory failure'],
            'septic_shock': ['septic shock', 'sepsis induced shock'],

            # Bleeding
            'gi_bleed': ['gi bleed', 'gi bleeding', 'gastrointestinal hemorrhage', 'GI hemorrhage'],
            'hemorrhage': ['hemorrhage', 'bleeding', 'blood loss'],

            # Vital Signs
            'hypotension': ['hypotension', 'low bp', 'low blood pressure'],
            'tachycardia': ['tachycardia', 'elevated heart rate', 'fast heart rate']
        }

    def get_all_variants(self):
        """Get all term variants including special terms"""
        variants = []
        for term_list in self.emergency_special_terms.values():
            variants.extend(term_list)
        return variants
```
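`get_all_variants` flattens every synonym list into one flat keyword pool, deliberately discarding which canonical key each variant came from. If a later step needs to map a matched string back to its canonical term (for instance, to populate the `condition` column reserved further down), the same dictionary supports a reverse index. A minimal sketch, assuming the class above is in scope; the `variant_to_canonical` name is illustrative and not part of this commit:

```python
# Invert the variant map: matched text (lowercased) -> canonical term key.
processor = MedicalTermProcessor()
variant_to_canonical = {
    variant.lower(): canonical
    for canonical, variants in processor.emergency_special_terms.items()
    for variant in variants
}

assert variant_to_canonical["myocardial infarction"] == "mi"
assert variant_to_canonical["stroke"] == "cva"
```

The script then defines the keyword loader: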
```python
# Function: Load keywords and print progress
def load_keywords(path, processor):
    print(f"📥 Loading keywords from: {path}")
    # Load basic keywords
    with open(path, "r", encoding="utf-8") as f:
        basic_kws = [line.strip() for line in f if line.strip()]

    # Add special term variants
    special_kws = processor.get_all_variants()
    all_kws = list(set(basic_kws + special_kws))  # Remove duplicates

    print(f" Loaded {len(all_kws)} keywords (including variants)")
    return all_kws
```
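The keyword file is plain text, one keyword per line; blank lines are skipped, and `set()` deduplicates across the file and the special-term variants, so listing a term like `shock` in both places is harmless. A quick isolated check, assuming the definitions above are in scope; the file contents here are invented:

```python
import os
import tempfile

# Write a tiny keyword file and load it through the same code path.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False,
                                 encoding="utf-8") as tmp:
    tmp.write("sepsis\ncardiac arrest\n\nanaphylaxis\n")
    kw_path = tmp.name

kws = load_keywords(kw_path, MedicalTermProcessor())
assert "sepsis" in kws and "anaphylaxis" in kws  # blank line was skipped
os.unlink(kw_path)
```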
```python
# Step 1: Read source data
print("1️⃣ Reading source data...")
source_path = "../dataset/guidelines_source_filtered.jsonl"
df = pd.read_json(source_path, lines=True)
print(f" Loaded {len(df)} records")
```
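`read_json(..., lines=True)` expects JSON Lines: one JSON object per line. The only column the rest of the script depends on is `clean_text`. For a self-contained dry run of the steps below you can substitute a toy frame; these records are invented, not from the real dataset:

```python
import pandas as pd

# Invented stand-in for guidelines_source_filtered.jsonl.
toy_df = pd.DataFrame({
    "clean_text": [
        "Early recognition of septic shock and hypotension improves outcomes.",
        "Routine screening in asymptomatic adults.",
        None,  # exercises the fillna("") branch below
    ]
})
```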
```python
# Step 2: Load emergency keywords and match
print("2️⃣ Loading emergency keywords and matching...")
processor = MedicalTermProcessor()
keywords = load_keywords("../keywords/emergency_keywords.txt", processor)
pattern = r"\b(?:" + "|".join(map(re.escape, keywords)) + r")\b"
```
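Each keyword is passed through `re.escape` so literal characters like the dots in `m.i.` are not treated as regex metacharacters, then the alternation is wrapped in a non-capturing group with `\b` word boundaries on both sides. One boundary subtlety worth knowing: a trailing `\b` after a non-word character such as `.` only matches when a word character follows, so variants ending in punctuation can be missed at sentence ends. A small self-contained demonstration with invented keywords:

```python
import re

demo_kws = ["septic shock", "hypotension", "m.i."]
demo_pat = r"\b(?:" + "|".join(map(re.escape, demo_kws)) + r")\b"

print(re.findall(demo_pat, "Septic shock with hypotension.", flags=re.IGNORECASE))
# ['Septic shock', 'hypotension']

# 'm.i.' ends in '.', so the trailing \b needs a word character right after it;
# at the end of a sentence there is none, and the variant goes unmatched.
print(re.findall(demo_pat, "History of m.i. noted.", flags=re.IGNORECASE))
# []
```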
```python
# Match keywords and add metadata columns
df["matched"] = (
    df["clean_text"]
    .fillna("")  # Convert NaN to empty string
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda lst: "|".join(lst) if lst else "")
)
df["has_emergency"] = df["matched"].str.len() > 0

# Add metadata columns for future use
df["type"] = "emergency"  # Document type identifier
df["condition"] = ""  # Reserved for future condition mapping
```
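`str.findall` yields a list of hits per row; joining with `|` flattens each list into a single delimited string, which survives `to_csv` intact (a raw list column would be written as its Python repr). The trade-off is that downstream consumers must split to recover the list; a sketch of that round trip:

```python
# Recovering the per-record keyword list downstream.
matched = "Septic shock|hypotension"
terms = matched.split("|") if matched else []  # empty string -> empty list
print(terms)  # ['Septic shock', 'hypotension']
```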
```python
# Calculate average matches
cnt_em = df["has_emergency"].sum()
avg_matches = (
    df[df["has_emergency"]]["matched"]
    .str.count(r"\|")  # Escape the pipe
    .add(1)
    .mean()
)

print(f" Matched {cnt_em} emergency-related records")
print(f" Average keywords per record: {avg_matches:.2f}")
```
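The arithmetic here relies on the string layout: k matched keywords are joined by k−1 pipes, so counting `\|` occurrences and adding 1 recovers k, and the filter on `has_emergency` keeps zero-match rows out of the average. A one-line check:

```python
import pandas as pd

# Two separators -> three keywords, so count + 1 == 3.
s = pd.Series(["Septic shock|hypotension|LOC"])
print(s.str.count(r"\|").add(1).iloc[0])  # 3
```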
```python
# Step 3: Save emergency subset
print("3️⃣ Saving emergency subset...")
out_dir = "../dataset/emergency"
os.makedirs(out_dir, exist_ok=True)
subset = df[df["has_emergency"]]

# Save with _opt suffix to distinguish from original files
subset.to_json(f"{out_dir}/emergency_subset_opt.jsonl", orient="records", lines=True)
subset.to_csv(f"{out_dir}/emergency_subset_opt.csv", index=False)
print(f"✅ Complete! Generated emergency subset with {len(subset)} records, saved in `{out_dir}` with _opt suffix")
```
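The relative paths assume the script is run from `dataset/scripts/`. The subset is written twice: JSONL for downstream processing and CSV for quick inspection. A minimal sanity check on the JSONL output, runnable after the script has completed:

```python
import pandas as pd

# Round-trip the saved subset and confirm only emergency-matched rows remain.
check = pd.read_json("../dataset/emergency/emergency_subset_opt.jsonl", lines=True)
assert check["has_emergency"].all()
print(len(check), "records round-tripped")
```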