YanBoChen committed on
Commit
a5bcfa7
·
1 Parent(s): 7d8970e

WIP: add dual keyword and text length distribution plots for treatment subset analysis

Browse files
dataset/scripts/data_explorer_treatment.py CHANGED
@@ -84,11 +84,10 @@ def analyze_treatment_subset(
84
  stats['treatment_keyword_stats'][keyword] = int(count)
85
  print(f" Treatment: {keyword} -> {count} records")
86
 
87
- # Co-occurrence analysis
88
  print("\n6️⃣ Computing keyword co-occurrence patterns...")
89
- print(" Creating boolean matrices...")
90
 
91
- # Initialize boolean matrices
92
  emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
93
  treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
94
 
@@ -96,35 +95,26 @@ def analyze_treatment_subset(
96
  print(" Pre-processing text...")
97
  df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
98
 
99
- # Fill emergency matrix with progress bar
100
- print(" Processing emergency keywords...")
101
  for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
102
- pattern = r'\b' + re.escape(keyword) + r'\b'
103
- try:
104
- emergency_matrix[:, i] = df['clean_text_lower'].str.contains(
105
- pattern,
106
- regex=True,
107
- na=False
108
- ).values
109
- except Exception as e:
110
- print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
111
 
112
- # Fill treatment matrix with progress bar
113
- print(" Processing treatment keywords...")
114
  for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
115
- pattern = r'\b' + re.escape(keyword) + r'\b'
116
- try:
117
- treatment_matrix[:, i] = df['clean_text_lower'].str.contains(
118
- pattern,
119
- regex=True,
120
- na=False
121
- ).values
122
- except Exception as e:
123
- print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
124
 
125
- # Compute co-occurrence using matrix multiplication
126
- print(" Computing co-occurrence matrix...")
127
- cooc_matrix = emergency_matrix.T @ treatment_matrix
 
128
 
129
  # Extract results
130
  print(" Extracting co-occurrence pairs...")
@@ -137,7 +127,7 @@ def analyze_treatment_subset(
137
  'emergency_keyword': em_kw,
138
  'treatment_keyword': tr_kw,
139
  'cooccurrence_count': count,
140
- 'percentage': float(count / total_records * 100)
141
  })
142
 
143
  # Sort and store results
@@ -149,35 +139,45 @@ def analyze_treatment_subset(
149
  for i, pair in enumerate(cooccurrence_pairs[:5]):
150
  print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
151
 
152
- # Path B validation metrics
153
  print("\n7️⃣ Validating Path B strategy effectiveness...")
154
 
155
- # Calculate keyword density
156
- emergency_density = []
157
- treatment_density = []
158
-
159
- for _, row in df.iterrows():
160
- text = str(row['clean_text']).lower()
161
- em_matches = sum(1 for kw in emergency_keywords if kw.lower() in text)
162
- tr_matches = sum(1 for kw in treatment_keywords if kw.lower() in text)
163
-
164
- emergency_density.append(em_matches)
165
- treatment_density.append(tr_matches)
166
 
 
167
  df['emergency_keyword_density'] = emergency_density
168
  df['treatment_keyword_density'] = treatment_density
169
 
 
170
  stats['path_b_validation'] = {
171
  'avg_emergency_density': float(np.mean(emergency_density)),
172
  'avg_treatment_density': float(np.mean(treatment_density)),
173
- 'high_density_records': int(sum(1 for ed, td in zip(emergency_density, treatment_density) if ed >= 2 and td >= 2)),
174
- 'precision_estimate': float(sum(1 for ed, td in zip(emergency_density, treatment_density) if ed >= 1 and td >= 1) / total_records)
175
  }
176
 
177
- print(f" Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
178
- print(f" Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
179
- print(f" High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
180
- print(f" Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  # Condition mapping candidates
183
  print("\n8️⃣ Preparing condition mapping candidates...")
 
84
  stats['treatment_keyword_stats'][keyword] = int(count)
85
  print(f" Treatment: {keyword} -> {count} records")
86
 
87
+ # Step 6: Co-occurrence analysis
88
  print("\n6️⃣ Computing keyword co-occurrence patterns...")
 
89
 
90
+ # Initialize matrices for full dataset
91
  emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
92
  treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
93
 
 
95
  print(" Pre-processing text...")
96
  df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
97
 
98
+ # Process all emergency keywords
99
+ print("\n Processing all emergency keywords...")
100
  for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
101
+ pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
102
+ emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
103
+ matches = emergency_matrix[:, i].sum()
104
+ print(f" - {keyword}: {matches} matches")
 
 
 
 
 
105
 
106
+ # Process all treatment keywords
107
+ print("\n Processing all treatment keywords...")
108
  for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
109
+ pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
110
+ treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
111
+ matches = treatment_matrix[:, i].sum()
112
+ print(f" - {keyword}: {matches} matches")
 
 
 
 
 
113
 
114
+ # Compute co-occurrence matrix
115
+ print("\n Computing co-occurrence matrix...")
116
+ cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
117
+ print(" Computation completed successfully")
118
 
119
  # Extract results
120
  print(" Extracting co-occurrence pairs...")
 
127
  'emergency_keyword': em_kw,
128
  'treatment_keyword': tr_kw,
129
  'cooccurrence_count': count,
130
+ 'percentage': float(count / len(df) * 100)
131
  })
132
 
133
  # Sort and store results
 
139
  for i, pair in enumerate(cooccurrence_pairs[:5]):
140
  print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
141
 
142
+ # Step 7: Path B validation metrics
143
  print("\n7️⃣ Validating Path B strategy effectiveness...")
144
 
145
+ # Compute keyword density with progress bar
146
+ print(" Computing keyword density...")
147
+ with tqdm(total=2, desc="Density calculation") as pbar:
148
+ emergency_density = emergency_matrix.sum(axis=1)
149
+ pbar.update(1)
150
+ treatment_density = treatment_matrix.sum(axis=1)
151
+ pbar.update(1)
 
 
 
 
152
 
153
+ # Store density in dataframe
154
  df['emergency_keyword_density'] = emergency_density
155
  df['treatment_keyword_density'] = treatment_density
156
 
157
+ # Calculate statistics
158
  stats['path_b_validation'] = {
159
  'avg_emergency_density': float(np.mean(emergency_density)),
160
  'avg_treatment_density': float(np.mean(treatment_density)),
161
+ 'high_density_records': int(sum((emergency_density >= 2) & (treatment_density >= 2))),
162
+ 'precision_estimate': float(sum((emergency_density >= 1) & (treatment_density >= 1)) / len(df))
163
  }
164
 
165
+ # Print detailed results
166
+ print("\n Results:")
167
+ print(f" - Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
168
+ print(f" - Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
169
+ print(f" - High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
170
+ print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
171
+
172
+ # Sample distribution analysis
173
+ print("\n Density Distribution:")
174
+ density_counts = pd.DataFrame({
175
+ 'emergency': emergency_density,
176
+ 'treatment': treatment_density
177
+ }).value_counts().head()
178
+ print(" Top 5 density combinations (emergency, treatment):")
179
+ for (em, tr), count in density_counts.items():
180
+ print(f" - {count} documents have {em} emergency and {tr} treatment keywords")
181
 
182
  # Condition mapping candidates
183
  print("\n8️⃣ Preparing condition mapping candidates...")