Spaces:
Sleeping
Sleeping
YanBoChen
committed on
Commit
·
a5bcfa7
1
Parent(s):
7d8970e
WIP: add dual keyword and text length distribution plots for treatment subset analysis
Browse files
dataset/scripts/data_explorer_treatment.py
CHANGED
@@ -84,11 +84,10 @@ def analyze_treatment_subset(
|
|
84 |
stats['treatment_keyword_stats'][keyword] = int(count)
|
85 |
print(f" Treatment: {keyword} -> {count} records")
|
86 |
|
87 |
-
# Co-occurrence analysis
|
88 |
print("\n6️⃣ Computing keyword co-occurrence patterns...")
|
89 |
-
print(" Creating boolean matrices...")
|
90 |
|
91 |
-
# Initialize
|
92 |
emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
|
93 |
treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
|
94 |
|
@@ -96,35 +95,26 @@ def analyze_treatment_subset(
|
|
96 |
print(" Pre-processing text...")
|
97 |
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
|
98 |
|
99 |
-
#
|
100 |
-
print(" Processing emergency keywords...")
|
101 |
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
102 |
-
pattern = r'
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
regex=True,
|
107 |
-
na=False
|
108 |
-
).values
|
109 |
-
except Exception as e:
|
110 |
-
print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
|
111 |
|
112 |
-
#
|
113 |
-
print(" Processing treatment keywords...")
|
114 |
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
115 |
-
pattern = r'
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
regex=True,
|
120 |
-
na=False
|
121 |
-
).values
|
122 |
-
except Exception as e:
|
123 |
-
print(f" Warning: Error processing keyword '{keyword}': {str(e)}")
|
124 |
|
125 |
-
# Compute co-occurrence
|
126 |
-
print(" Computing co-occurrence matrix...")
|
127 |
-
cooc_matrix = emergency_matrix.T @ treatment_matrix
|
|
|
128 |
|
129 |
# Extract results
|
130 |
print(" Extracting co-occurrence pairs...")
|
@@ -137,7 +127,7 @@ def analyze_treatment_subset(
|
|
137 |
'emergency_keyword': em_kw,
|
138 |
'treatment_keyword': tr_kw,
|
139 |
'cooccurrence_count': count,
|
140 |
-
'percentage': float(count /
|
141 |
})
|
142 |
|
143 |
# Sort and store results
|
@@ -149,35 +139,45 @@ def analyze_treatment_subset(
|
|
149 |
for i, pair in enumerate(cooccurrence_pairs[:5]):
|
150 |
print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
|
151 |
|
152 |
-
# Path B validation metrics
|
153 |
print("\n7️⃣ Validating Path B strategy effectiveness...")
|
154 |
|
155 |
-
#
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
tr_matches = sum(1 for kw in treatment_keywords if kw.lower() in text)
|
163 |
-
|
164 |
-
emergency_density.append(em_matches)
|
165 |
-
treatment_density.append(tr_matches)
|
166 |
|
|
|
167 |
df['emergency_keyword_density'] = emergency_density
|
168 |
df['treatment_keyword_density'] = treatment_density
|
169 |
|
|
|
170 |
stats['path_b_validation'] = {
|
171 |
'avg_emergency_density': float(np.mean(emergency_density)),
|
172 |
'avg_treatment_density': float(np.mean(treatment_density)),
|
173 |
-
'high_density_records': int(sum(
|
174 |
-
'precision_estimate': float(sum(
|
175 |
}
|
176 |
|
177 |
-
|
178 |
-
print(
|
179 |
-
print(f"
|
180 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
# Condition mapping candidates
|
183 |
print("\n8️⃣ Preparing condition mapping candidates...")
|
|
|
84 |
stats['treatment_keyword_stats'][keyword] = int(count)
|
85 |
print(f" Treatment: {keyword} -> {count} records")
|
86 |
|
87 |
+
# Step 6: Co-occurrence analysis
|
88 |
print("\n6️⃣ Computing keyword co-occurrence patterns...")
|
|
|
89 |
|
90 |
+
# Initialize matrices for full dataset
|
91 |
emergency_matrix = np.zeros((len(df), len(emergency_keywords)), dtype=bool)
|
92 |
treatment_matrix = np.zeros((len(df), len(treatment_keywords)), dtype=bool)
|
93 |
|
|
|
95 |
print(" Pre-processing text...")
|
96 |
df['clean_text_lower'] = df['clean_text'].fillna('').str.lower()
|
97 |
|
98 |
+
# Process all emergency keywords
|
99 |
+
print("\n Processing all emergency keywords...")
|
100 |
for i, keyword in enumerate(tqdm(emergency_keywords, desc="Emergency keywords")):
|
101 |
+
pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
|
102 |
+
emergency_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
103 |
+
matches = emergency_matrix[:, i].sum()
|
104 |
+
print(f" - {keyword}: {matches} matches")
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
+
# Process all treatment keywords
|
107 |
+
print("\n Processing all treatment keywords...")
|
108 |
for i, keyword in enumerate(tqdm(treatment_keywords, desc="Treatment keywords")):
|
109 |
+
pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
|
110 |
+
treatment_matrix[:, i] = df['clean_text_lower'].str.contains(pattern, regex=True, na=False)
|
111 |
+
matches = treatment_matrix[:, i].sum()
|
112 |
+
print(f" - {keyword}: {matches} matches")
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
+
# Compute co-occurrence matrix
|
115 |
+
print("\n Computing co-occurrence matrix...")
|
116 |
+
cooc_matrix = emergency_matrix.astype(int).T @ treatment_matrix.astype(int)
|
117 |
+
print(" Computation completed successfully")
|
118 |
|
119 |
# Extract results
|
120 |
print(" Extracting co-occurrence pairs...")
|
|
|
127 |
'emergency_keyword': em_kw,
|
128 |
'treatment_keyword': tr_kw,
|
129 |
'cooccurrence_count': count,
|
130 |
+
'percentage': float(count / len(df) * 100)
|
131 |
})
|
132 |
|
133 |
# Sort and store results
|
|
|
139 |
for i, pair in enumerate(cooccurrence_pairs[:5]):
|
140 |
print(f" {i+1}. {pair['emergency_keyword']} + {pair['treatment_keyword']}: {pair['cooccurrence_count']} ({pair['percentage']:.1f}%)")
|
141 |
|
142 |
+
# Step 7: Path B validation metrics
|
143 |
print("\n7️⃣ Validating Path B strategy effectiveness...")
|
144 |
|
145 |
+
# Compute keyword density with progress bar
|
146 |
+
print(" Computing keyword density...")
|
147 |
+
with tqdm(total=2, desc="Density calculation") as pbar:
|
148 |
+
emergency_density = emergency_matrix.sum(axis=1)
|
149 |
+
pbar.update(1)
|
150 |
+
treatment_density = treatment_matrix.sum(axis=1)
|
151 |
+
pbar.update(1)
|
|
|
|
|
|
|
|
|
152 |
|
153 |
+
# Store density in dataframe
|
154 |
df['emergency_keyword_density'] = emergency_density
|
155 |
df['treatment_keyword_density'] = treatment_density
|
156 |
|
157 |
+
# Calculate statistics
|
158 |
stats['path_b_validation'] = {
|
159 |
'avg_emergency_density': float(np.mean(emergency_density)),
|
160 |
'avg_treatment_density': float(np.mean(treatment_density)),
|
161 |
+
'high_density_records': int(sum((emergency_density >= 2) & (treatment_density >= 2))),
|
162 |
+
'precision_estimate': float(sum((emergency_density >= 1) & (treatment_density >= 1)) / len(df))
|
163 |
}
|
164 |
|
165 |
+
# Print detailed results
|
166 |
+
print("\n Results:")
|
167 |
+
print(f" - Average emergency keyword density: {stats['path_b_validation']['avg_emergency_density']:.2f}")
|
168 |
+
print(f" - Average treatment keyword density: {stats['path_b_validation']['avg_treatment_density']:.2f}")
|
169 |
+
print(f" - High-density records (≥2 each): {stats['path_b_validation']['high_density_records']}")
|
170 |
+
print(f" - Precision estimate: {stats['path_b_validation']['precision_estimate']:.2f}")
|
171 |
+
|
172 |
+
# Sample distribution analysis
|
173 |
+
print("\n Density Distribution:")
|
174 |
+
density_counts = pd.DataFrame({
|
175 |
+
'emergency': emergency_density,
|
176 |
+
'treatment': treatment_density
|
177 |
+
}).value_counts().head()
|
178 |
+
print(" Top 5 density combinations (emergency, treatment):")
|
179 |
+
for (em, tr), count in density_counts.items():
|
180 |
+
print(f" - {count} documents have {em} emergency and {tr} treatment keywords")
|
181 |
|
182 |
# Condition mapping candidates
|
183 |
print("\n8️⃣ Preparing condition mapping candidates...")
|