YanBoChen committed
Commit 7d8970e · 1 Parent(s): ee06c0f

WIP: Try to analyze treatment_subset; enhance emergency and treatment filtering scripts with metadata and analysis functionality

dataset/scripts/01_filter_emergency.py CHANGED
@@ -23,7 +23,7 @@ print("2️⃣ Loading emergency keywords and matching...")
 keywords = load_keywords("../keywords/emergency_keywords.txt")
 pattern = r"\b(?:" + "|".join(keywords) + r")\b"  # Using non-capturing groups (?:...)
 
-# Match keywords
+# Match keywords and add metadata columns
 df["matched"] = (
     df["clean_text"]
     .fillna("")  # Convert NaN to empty string
@@ -31,9 +31,13 @@ df["matched"] = (
     .apply(lambda lst: "|".join(lst) if lst else "")
 )
 df["has_emergency"] = df["matched"].str.len() > 0
-cnt_em = df["has_emergency"].sum()
 
-# Calculate average matches (with escape)
+# Add metadata columns for future use
+df["type"] = "emergency"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
+# Calculate average matches
+cnt_em = df["has_emergency"].sum()
 avg_matches = (
     df[df["has_emergency"]]["matched"]
     .str.count(r"\|")  # Escape the pipe
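For reference, a minimal sketch (not part of this commit) of how the word-boundary, non-capturing-group pattern and the pipe-count step behave on a made-up DataFrame. The keyword list and rows are hypothetical, and the `+ 1` that turns separator counts into match counts is an assumption, since the hunk above cuts off before `avg_matches` is complete:

# Toy sketch: word-boundary keyword matching and an average-match estimate.
import re
import pandas as pd

keywords = ["sepsis", "cardiac arrest"]           # hypothetical keywords
pattern = r"\b(?:" + "|".join(keywords) + r")\b"  # non-capturing group

df = pd.DataFrame({"clean_text": [
    "Suspected sepsis with cardiac arrest on arrival",
    "Routine follow-up visit",
    None,
]})

df["matched"] = (
    df["clean_text"]
    .fillna("")                                      # NaN -> empty string
    .str.findall(pattern, flags=re.IGNORECASE)
    .apply(lambda lst: "|".join(lst) if lst else "")
)
df["has_emergency"] = df["matched"].str.len() > 0

# "sepsis|cardiac arrest" has one "|" and two matches, hence the + 1
avg_matches = (df[df["has_emergency"]]["matched"].str.count(r"\|") + 1).mean()
print(avg_matches)  # 2.0 for this toy data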
dataset/scripts/02_filter_treatment.py CHANGED
@@ -1,37 +1,49 @@
 # scripts/02_filter_treatment.py
 
 import os
+import re
 import pandas as pd
 
-# Utility function: load keywords
+# Function: Load keywords and print progress
 def load_keywords(path):
-    print(f"📥 Loading keywords: {path}")
-    with open(path, "r") as f:
+    print(f"📥 Loading keywords from: {path}")
+    with open(path, "r", encoding="utf-8") as f:
         kws = [line.strip() for line in f if line.strip()]
-    print(f" Loaded {len(kws)} keywords in total")
+    print(f" Loaded {len(kws)} keywords")
    return kws
 
-# Step 1: Load the emergency subset
-print("1️⃣ Loading the emergency subset…")
+# Step 1: Load emergency subset
+print("1️⃣ Reading emergency subset...")
 emergency_path = "../dataset/emergency/emergency_subset.jsonl"
 df = pd.read_json(emergency_path, lines=True)
-print(f" Read {len(df)} emergency-related records")
+print(f" Loaded {len(df)} emergency records")
 
-# Step 2: Load treatment/management keywords and filter
-print("2️⃣ Loading treatment/management keywords and filtering…")
+# Step 2: Load and apply treatment keywords
+print("2️⃣ Loading treatment keywords and filtering...")
 treatment_keywords = load_keywords("../keywords/treatment_keywords.txt")
-pattern2 = "|".join(treatment_keywords)
-df["has_treatment"] = df["clean_text"].str.contains(pattern2, case=False, na=False)
+pattern = r"\b(?:" + "|".join(treatment_keywords) + r")\b"
+
+# Match treatment keywords and add metadata
+df["treatment_matched"] = (
+    df["clean_text"]
+    .fillna("")
+    .str.findall(pattern, flags=re.IGNORECASE)
+    .apply(lambda lst: "|".join(lst) if lst else "")
+)
+df["has_treatment"] = df["treatment_matched"].str.len() > 0
+
+# Add metadata columns for future use
+df["type"] = "treatment"  # Document type identifier
+df["condition"] = ""  # Reserved for future condition mapping
+
 cnt_treat = df["has_treatment"].sum()
-print(f" Matched {cnt_treat} records containing treatment/management descriptions")
+print(f" Matched {cnt_treat} records with treatment information")
 
-# Step 3: Save the emergency + treatment subset
-print("3️⃣ Saving the emergency + treatment subset…")
+# Step 3: Save treatment subset
+print("3️⃣ Saving treatment subset...")
 out_dir = "../dataset/emergency_treatment"
 os.makedirs(out_dir, exist_ok=True)
-subset2 = df[df["has_treatment"]]
-subset2.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
-subset2.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
-print(f" Saved {len(subset2)} records to `{out_dir}`")
-
-print("✅ Done! Emergency + treatment subset generated.")
+subset = df[df["has_treatment"]]
+subset.to_json(f"{out_dir}/emergency_treatment_subset.jsonl", orient="records", lines=True)
+subset.to_csv(f"{out_dir}/emergency_treatment_subset.csv", index=False)
+print(f" Complete! Generated treatment subset with {len(subset)} records, saved in `{out_dir}`")
 
 
dataset/scripts/data_explorer_treatment.py ADDED
@@ -0,0 +1,330 @@
+# /scripts/data_explorer_treatment.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+from pathlib import Path
+import json
+import numpy as np
+from tqdm import tqdm
+import re
+
+def analyze_treatment_subset(
+    treatment_file_path,
+    emergency_keywords_path,
+    treatment_keywords_path,
+    output_dir="analysis_treatment"
+):
+    """
+    Specialized analysis for treatment subset focusing on:
+    1. Dual keyword analysis (emergency + treatment)
+    2. Path B effectiveness validation
+    3. Condition mapping data preparation
+    4. RAG readiness assessment
+    """
+    print(f"\n{'='*60}")
+    print(f"Treatment Subset Analysis")
+    print(f"Treatment file: {treatment_file_path}")
+    print(f"Emergency keywords: {emergency_keywords_path}")
+    print(f"Treatment keywords: {treatment_keywords_path}")
+    print(f"Output directory: {output_dir}")
+    print(f"{'='*60}\n")
+
+    # Load data
+    print("1️⃣ Loading treatment subset data...")
+    df = pd.read_csv(treatment_file_path)
+    output_dir = Path(output_dir)
+
+    # Load keyword lists
+    print("2️⃣ Loading keyword lists...")
+    with open(emergency_keywords_path, 'r', encoding='utf-8') as f:
+        emergency_keywords = [line.strip() for line in f if line.strip()]
+
+    with open(treatment_keywords_path, 'r', encoding='utf-8') as f:
+        treatment_keywords = [line.strip() for line in f if line.strip()]
+
+    print(f" Emergency keywords: {len(emergency_keywords)}")
+    print(f" Treatment keywords: {len(treatment_keywords)}")
+
+    # Basic statistics
+    print("\n3️⃣ Computing basic statistics...")
+    total_records = len(df)
+    df['text_length'] = df['clean_text'].str.len()
+    avg_length = df['text_length'].mean()
+
+    print(f" Total treatment records: {total_records}")
+    print(f" Average text length: {avg_length:.2f} characters")
+
+    # Initialize comprehensive statistics
+    stats = {
+        'basic_statistics': {
+            'total_records': int(total_records),
+            'avg_text_length': float(avg_length),
+            'emergency_keywords_count': len(emergency_keywords),
+            'treatment_keywords_count': len(treatment_keywords)
+        },
+        'emergency_keyword_stats': {},
+        'treatment_keyword_stats': {},
+        'cooccurrence_analysis': {},
+        'path_b_validation': {},
+        'condition_mapping_candidates': {}
+    }
+
+    # Emergency keyword analysis in treatment subset
+    print("\n4️⃣ Analyzing emergency keywords in treatment subset...")
+    for keyword in emergency_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['emergency_keyword_stats'][keyword] = int(count)
+        print(f" Emergency: {keyword} -> {count} records")
+
+    # Treatment keyword analysis
+    print("\n5️⃣ Analyzing treatment keywords...")
+    for keyword in treatment_keywords:
+        count = df['clean_text'].str.contains(keyword, case=False, na=False).sum()
+        stats['treatment_keyword_stats'][keyword] = int(count)
+        print(f" Treatment: {keyword} -> {count} records")
+
+    # Co-occurrence analysis
+    print("\n6️⃣ Computing keyword co-occurrence patterns...")
+    print(" Creating boolean matrices...")
+
+    # Initialize boolean matrices
+    emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
+    treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
+
+    # Pre-process text
+    print(" Pre-processing text...")
+    df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
+
+    # Fill emergency matrix with progress bar
+    print(" Processing emergency keywords...")
+    for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
+        pattern = r'\b' + re.escape(keyword) + r'\b'
+        try:
+            emergency_matrix[:, i] = df['clean_text_lower'].str.contains(
+                pattern,
+                regex=True,
+                na=False
+            ).values
+        except Exception as e:
+            print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
+
+    # Fill treatment matrix with progress bar
+    print(" Processing treatment keywords...")
+    for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
+        pattern = r'\b' + re.escape(keyword) + r'\b'
+        try:
+            treatment_matrix[:, i] = df['clean_text_lower'].str.contains(
+                pattern,
+                regex=True,
+                na=False
+            ).values
+        except Exception as e:
+            print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
+
+    # Compute co-occurrence using matrix multiplication
+    print(" Computing co-occurrence matrix...")
+    cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)  # cast to int so the product yields counts
+
+    # Extract results
+    print(" Extracting co-occurrence pairs...")
+    cooccurrence_pairs = []
+    for i, em_kw in enumerate(emergency_keywords):
+        for j, tr_kw in enumerate(treatment_keywords):
+            count = int(cooc_matrix[i, j])
+            if count > 0:
+                cooccurrence_pairs.append({
+                    'emergency_keyword': em_kw,
+                    'treatment_keyword': tr_kw,
+                    'cooccurrence_count': count,
+                    'percentage': float(count / total_records * 100)
+                })
+
+    # Sort and store results
+    cooccurrence_pairs.sort(key=lambda x: x['cooccurrence_count'], reverse=True)
+    stats['cooccurrence_analysis'] = cooccurrence_pairs[:20]  # Top 20 pairs
+
+    print(f" Found {len(cooccurrence_pairs)} co-occurrence pairs")
+    print(" Top 5 co-occurrence pairs:")
+    for i, pair in enumerate(cooccurrence_pairs[:5]):
+        print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
+
+    # Path B validation metrics
+    print("\n7️⃣ Validating Path B strategy effectiveness...")
+
+    # Calculate keyword density
+    emergency_density = []
+    treatment_density = []
+
+    for _, row in df.iterrows():
+        text = str(row['clean_text']).lower()
+        em_matches = sum(1 for kw in emergency_keywords if kw.lower() in text)
+        tr_matches = sum(1 for kw in treatment_keywords if kw.lower() in text)
+
+        emergency_density.append(em_matches)
+        treatment_density.append(tr_matches)
+
+    df['emergency_keyword_density'] = emergency_density
+    df['treatment_keyword_density'] = treatment_density
+
+    stats['path_b_validation'] = {
+        'avg_emergency_density': float(np.mean(emergency_density)),
+        'avg_treatment_density': float(np.mean(treatment_density)),
+        'high_density_records': int(sum(1 for ed, td in zip(emergency_density, treatment_density) if ed >= 2 and td >= 2)),
+        'precision_estimate': float(sum(1 for ed, td in zip(emergency_density, treatment_density) if ed >= 1 and td >= 1) / total_records)
+    }
+
+    print(f" Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
+    print(f" Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
+    print(f" High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
+    print(f" Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
+
+    # Condition mapping candidates
+    print("\n8️⃣ Preparing condition mapping candidates...")
+
+    # Group emergency keywords by potential conditions
+    condition_candidates = {}
+    for pair in cooccurrence_pairs[:10]:  # Top 10 pairs
+        em_kw = pair['emergency_keyword']
+        tr_kw = pair['treatment_keyword']
+
+        # Simple condition inference (can be enhanced later)
+        if any(cardiac_term in em_kw.lower() for cardiac_term in ['mi', 'cardiac', 'heart', 'chest']):
+            condition = 'cardiac'
+        elif any(resp_term in em_kw.lower() for resp_term in ['respiratory', 'breathing', 'lung', 'dyspnea']):
+            condition = 'respiratory'
+        elif any(neuro_term in em_kw.lower() for neuro_term in ['stroke', 'seizure', 'consciousness']):
+            condition = 'neurological'
+        else:
+            condition = 'general'
+
+        if condition not in condition_candidates:
+            condition_candidates[condition] = []
+
+        condition_candidates[condition].append({
+            'emergency_keyword': em_kw,
+            'treatment_keyword': tr_kw,
+            'strength': pair['cooccurrence_count']
+        })
+
+    stats['condition_mapping_candidates'] = condition_candidates
+
+    # Visualization
+    print("\n9️⃣ Generating visualizations...")
+    output_plots = output_dir / "plots"
+    output_plots.mkdir(parents=True, exist_ok=True)
+
+    # 1. Dual keyword distribution
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
+
+    # Emergency keywords in treatment subset
+    em_counts = list(stats['emergency_keyword_stats'].values())
+    em_labels = list(stats['emergency_keyword_stats'].keys())
+    ax1.bar(range(len(em_labels)), em_counts)
+    ax1.set_title('Emergency Keywords in Treatment Subset')
+    ax1.set_xlabel('Emergency Keywords')
+    ax1.set_ylabel('Document Count')
+    ax1.tick_params(axis='x', rotation=45, labelsize=8)
+    ax1.set_xticks(range(len(em_labels)))
+    ax1.set_xticklabels(em_labels, ha='right')
+
+    # Treatment keywords
+    tr_counts = list(stats['treatment_keyword_stats'].values())
+    tr_labels = list(stats['treatment_keyword_stats'].keys())
+    ax2.bar(range(len(tr_labels)), tr_counts)
+    ax2.set_title('Treatment Keywords Distribution')
+    ax2.set_xlabel('Treatment Keywords')
+    ax2.set_ylabel('Document Count')
+    ax2.tick_params(axis='x', rotation=45, labelsize=8)
+    ax2.set_xticks(range(len(tr_labels)))
+    ax2.set_xticklabels(tr_labels, ha='right')
+
+    plt.tight_layout()
+    plt.savefig(output_plots / "dual_keyword_distribution.png", bbox_inches='tight', dpi=300)
+    plt.close()
+
+    # 2. Co-occurrence heatmap (top pairs)
+    if len(cooccurrence_pairs) > 0:
+        top_pairs = cooccurrence_pairs[:15]  # Top 15 for readability
+        cooc_matrix = np.zeros((len(set([p['emergency_keyword'] for p in top_pairs])),
+                                len(set([p['treatment_keyword'] for p in top_pairs]))))
+
+        em_unique = list(set([p['emergency_keyword'] for p in top_pairs]))
+        tr_unique = list(set([p['treatment_keyword'] for p in top_pairs]))
+
+        for pair in top_pairs:
+            i = em_unique.index(pair['emergency_keyword'])
+            j = tr_unique.index(pair['treatment_keyword'])
+            cooc_matrix[i, j] = pair['cooccurrence_count']
+
+        plt.figure(figsize=(12, 8))
+        sns.heatmap(cooc_matrix,
+                    xticklabels=tr_unique,
+                    yticklabels=em_unique,
+                    annot=True,
+                    fmt='g',
+                    cmap='YlOrRd')
+        plt.title('Emergency-Treatment Keywords Co-occurrence Heatmap')
+        plt.xlabel('Treatment Keywords')
+        plt.ylabel('Emergency Keywords')
+        plt.xticks(rotation=45, ha='right')
+        plt.yticks(rotation=0)
+        plt.tight_layout()
+        plt.savefig(output_plots / "cooccurrence_heatmap.png", bbox_inches='tight', dpi=300)
+        plt.close()
+
+    # 3. Text length distribution
+    plt.figure(figsize=(10, 6))
+    df['text_length'].hist(bins=50, alpha=0.7)
+    plt.title('Text Length Distribution in Treatment Subset')
+    plt.xlabel('Text Length (characters)')
+    plt.ylabel('Frequency')
+    plt.axvline(avg_length, color='red', linestyle='--', label=f'Average: {avg_length:.0f}')
+    plt.legend()
+    plt.savefig(output_plots / "text_length_distribution.png", bbox_inches='tight')
+    plt.close()
+
+    # 4. Keyword density scatter plot
+    plt.figure(figsize=(10, 8))
+    plt.scatter(df['emergency_keyword_density'], df['treatment_keyword_density'], alpha=0.6)
+    plt.xlabel('Emergency Keyword Density')
+    plt.ylabel('Treatment Keyword Density')
+    plt.title('Emergency vs Treatment Keyword Density')
+    plt.grid(True, alpha=0.3)
+    plt.savefig(output_plots / "keyword_density_scatter.png", bbox_inches='tight')
+    plt.close()
+
+    # Save comprehensive statistics
+    print("\n🔟 Saving analysis results...")
+    stats_dir = output_dir / "stats"
+    stats_dir.mkdir(parents=True, exist_ok=True)
+
+    with open(stats_dir / "treatment_analysis_comprehensive.json", 'w', encoding='utf-8') as f:
+        json.dump(stats, f, indent=2, ensure_ascii=False)
+
+    # Save co-occurrence pairs as CSV for easy review
+    if cooccurrence_pairs:
+        cooc_df = pd.DataFrame(cooccurrence_pairs)
+        cooc_df.to_csv(stats_dir / "cooccurrence_pairs.csv", index=False)
+
+    print(f"✅ Treatment subset analysis complete!")
+    print(f" Results saved to: {output_dir}")
+    print(f" Plots: {output_plots}")
+    print(f" Statistics: {stats_dir}")
+
+    return stats
+
+if __name__ == "__main__":
+    # Configuration
+    treatment_file = "../dataset/emergency_treatment/emergency_treatment_subset.csv"
+    emergency_keywords = "../keywords/emergency_keywords.txt"
+    treatment_keywords = "../keywords/treatment_keywords.txt"
+    output_directory = "../analysis_treatment"
+
+    # Run analysis
+    results = analyze_treatment_subset(
+        treatment_file,
+        emergency_keywords,
+        treatment_keywords,
+        output_directory
+    )
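A brief, hedged note on the co-occurrence step in data_explorer_treatment.py: a self-contained toy sketch (keywords and documents are made up) of why the keyword-membership matrices are cast to int before the matrix product, so each cell counts the documents containing both keywords of a pair:

# Toy sketch of the co-occurrence counting idea (made-up data).
import numpy as np

docs = [
    "cardiac arrest treated with epinephrine",
    "sepsis managed with iv fluids and epinephrine",
    "routine visit",
]
emergency_keywords = ["cardiac arrest", "sepsis"]
treatment_keywords = ["epinephrine", "iv fluids"]

# docs x keywords membership matrices, cast to int so "@" accumulates counts
emergency_matrix = np.array([[kw in d for kw in emergency_keywords] for d in docs]).astype(int)
treatment_matrix = np.array([[kw in d for kw in treatment_keywords] for d in docs]).astype(int)

cooc = emergency_matrix.T @ treatment_matrix  # rows: emergency kw, cols: treatment kw
print(cooc)
# [[1 0]   "cardiac arrest" co-occurs with "epinephrine" in 1 doc, with "iv fluids" in 0
#  [1 1]]  "sepsis" co-occurs with each treatment keyword in 1 doc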