Spaces:
Sleeping
Sleeping
Merge pull request #3 from YanBoChen0928/generation
Browse filesGeneration (complete user_input-> user_prompt (llm_client,medical_condition) ->retrieval.py->generation.py process)
- .env.example +2 -0
- .gitignore +31 -0
- requirements.txt +9 -0
- src/Todo_20250731_Multlevel_Fallback_Fix.md +83 -0
- src/__init__.py +12 -2
- src/data_processing.py +116 -67
- src/generation.py +519 -0
- src/llm_clients.py +308 -0
- src/medical_conditions.py +99 -0
- src/retrieval.py +391 -0
- src/user_prompt.py +562 -0
- test_retrieval_pipeline.py +223 -0
- tests/requirements.txt +95 -0
- tests/result_of_test_end_to_end_pipeline.md +0 -0
- tests/result_of_test_multlevel_fallback_validation.md +570 -0
- tests/result_of_test_multlevel_fallback_validation_revised.md +534 -0
- tests/result_of_test_userinput_userprompt_medical_condition_llm.md +381 -0
- tests/test_chunk_quality_analysis.py +333 -0
- tests/test_data_processing.py +99 -8
- tests/test_embedding_and_index.py +98 -26
- tests/test_embedding_validation.py +99 -15
- tests/test_end_to_end_pipeline.py +473 -0
- tests/test_multilevel_fallback_validation.py +553 -0
- tests/test_retrieval.py +206 -0
- tests/test_user_prompt.py +92 -0
- tests/test_userinput_userprompt_medical_condition_llm_retrieval.py +479 -0
.env.example
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# .env.example document
|
2 |
+
HF_TOKEN=your_huggingface_token_here
|
.gitignore
CHANGED
@@ -1,17 +1,27 @@
|
|
1 |
# 🧠 Virtual environments
|
2 |
genAIvenv/
|
3 |
.final_project_env/
|
|
|
|
|
|
|
|
|
4 |
|
5 |
# 💻 OS / Editor garbage
|
6 |
.DS_Store
|
7 |
.vscode/
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# 📁 Documentation and project folders
|
10 |
docs/
|
11 |
dataset/dataset/
|
|
|
12 |
|
13 |
# 🧾 Compiled / output files
|
14 |
*.pyc
|
|
|
15 |
*.log
|
16 |
*.zip
|
17 |
*.tar.gz
|
@@ -20,6 +30,15 @@ dataset/dataset/
|
|
20 |
*.json
|
21 |
*.png
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# 🚫 Large files - models
|
24 |
models/cache/
|
25 |
models/cache/*.pkl
|
@@ -32,3 +51,15 @@ models/indices/annoy/*.ann
|
|
32 |
*.pkl
|
33 |
*.npy
|
34 |
*.ann
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# 🧠 Virtual environments
|
2 |
genAIvenv/
|
3 |
.final_project_env/
|
4 |
+
.env
|
5 |
+
.venv
|
6 |
+
env/
|
7 |
+
venv/
|
8 |
|
9 |
# 💻 OS / Editor garbage
|
10 |
.DS_Store
|
11 |
.vscode/
|
12 |
+
*.swp
|
13 |
+
*~
|
14 |
+
.idea/
|
15 |
+
*.iml
|
16 |
|
17 |
# 📁 Documentation and project folders
|
18 |
docs/
|
19 |
dataset/dataset/
|
20 |
+
cache/
|
21 |
|
22 |
# 🧾 Compiled / output files
|
23 |
*.pyc
|
24 |
+
__pycache__/
|
25 |
*.log
|
26 |
*.zip
|
27 |
*.tar.gz
|
|
|
30 |
*.json
|
31 |
*.png
|
32 |
|
33 |
+
# 🔑 Secrets and configs
|
34 |
+
.env
|
35 |
+
.env.local
|
36 |
+
.env.*.local
|
37 |
+
*.pem
|
38 |
+
credentials.json
|
39 |
+
token.json
|
40 |
+
*.mdc
|
41 |
+
|
42 |
# 🚫 Large files - models
|
43 |
models/cache/
|
44 |
models/cache/*.pkl
|
|
|
51 |
*.pkl
|
52 |
*.npy
|
53 |
*.ann
|
54 |
+
|
55 |
+
# 📊 Jupyter Notebook
|
56 |
+
.ipynb_checkpoints
|
57 |
+
*/.ipynb_checkpoints/*
|
58 |
+
*.ipynb_checkpoints*
|
59 |
+
|
60 |
+
# 📝 Coverage reports
|
61 |
+
htmlcov/
|
62 |
+
.coverage
|
63 |
+
.coverage.*
|
64 |
+
coverage.xml
|
65 |
+
*.cover
|
requirements.txt
CHANGED
@@ -32,13 +32,16 @@ huggingface-hub==0.33.4
|
|
32 |
idna==3.10
|
33 |
Jinja2==3.1.6
|
34 |
jiter==0.10.0
|
|
|
35 |
kiwisolver==1.4.8
|
36 |
markdown-it-py==3.0.0
|
37 |
MarkupSafe==3.0.2
|
38 |
matplotlib==3.10.3
|
39 |
mdurl==0.1.2
|
|
|
40 |
multidict==6.6.3
|
41 |
multiprocess==0.70.16
|
|
|
42 |
numpy==2.3.1
|
43 |
openai==1.97.0
|
44 |
orjson==3.11.0
|
@@ -53,6 +56,7 @@ pydub==0.25.1
|
|
53 |
Pygments==2.19.2
|
54 |
pyparsing==3.2.3
|
55 |
python-dateutil==2.9.0.post0
|
|
|
56 |
python-multipart==0.0.20
|
57 |
pytz==2025.2
|
58 |
PyYAML==6.0.2
|
@@ -62,6 +66,8 @@ rich==14.0.0
|
|
62 |
ruff==0.12.4
|
63 |
safehttpx==0.1.6
|
64 |
safetensors==0.5.3
|
|
|
|
|
65 |
seaborn==0.13.2
|
66 |
semantic-version==2.10.0
|
67 |
sentence-transformers==3.0.1
|
@@ -69,8 +75,11 @@ shellingham==1.5.4
|
|
69 |
six==1.17.0
|
70 |
sniffio==1.3.1
|
71 |
starlette==0.47.2
|
|
|
|
|
72 |
tokenizers==0.21.2
|
73 |
tomlkit==0.13.3
|
|
|
74 |
tqdm==4.67.1
|
75 |
transformers==4.53.2
|
76 |
typer==0.16.0
|
|
|
32 |
idna==3.10
|
33 |
Jinja2==3.1.6
|
34 |
jiter==0.10.0
|
35 |
+
joblib==1.5.1
|
36 |
kiwisolver==1.4.8
|
37 |
markdown-it-py==3.0.0
|
38 |
MarkupSafe==3.0.2
|
39 |
matplotlib==3.10.3
|
40 |
mdurl==0.1.2
|
41 |
+
mpmath==1.3.0
|
42 |
multidict==6.6.3
|
43 |
multiprocess==0.70.16
|
44 |
+
networkx==3.5
|
45 |
numpy==2.3.1
|
46 |
openai==1.97.0
|
47 |
orjson==3.11.0
|
|
|
56 |
Pygments==2.19.2
|
57 |
pyparsing==3.2.3
|
58 |
python-dateutil==2.9.0.post0
|
59 |
+
python-dotenv==1.1.1
|
60 |
python-multipart==0.0.20
|
61 |
pytz==2025.2
|
62 |
PyYAML==6.0.2
|
|
|
66 |
ruff==0.12.4
|
67 |
safehttpx==0.1.6
|
68 |
safetensors==0.5.3
|
69 |
+
scikit-learn==1.7.1
|
70 |
+
scipy==1.16.1
|
71 |
seaborn==0.13.2
|
72 |
semantic-version==2.10.0
|
73 |
sentence-transformers==3.0.1
|
|
|
75 |
six==1.17.0
|
76 |
sniffio==1.3.1
|
77 |
starlette==0.47.2
|
78 |
+
sympy==1.14.0
|
79 |
+
threadpoolctl==3.6.0
|
80 |
tokenizers==0.21.2
|
81 |
tomlkit==0.13.3
|
82 |
+
torch==2.7.1
|
83 |
tqdm==4.67.1
|
84 |
transformers==4.53.2
|
85 |
typer==0.16.0
|
src/Todo_20250731_Multlevel_Fallback_Fix.md
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
太棒了!讓我們一起分析這個詳細的測試結果,這些數據很有價值!
|
2 |
+
|
3 |
+
## 📊 **測試結果深度分析**
|
4 |
+
|
5 |
+
### **✅ 成功的部分 (53.8% 成功率)**
|
6 |
+
|
7 |
+
**Level 1 - 預定義映射**: 💯 **完美表現**
|
8 |
+
- 3/3 測試通過,平均 0.000s(瞬間響應)
|
9 |
+
- `acute myocardial infarction`, `acute stroke`, `pulmonary embolism` 都直接命中
|
10 |
+
|
11 |
+
**Level 4b→5 - 冷門醫療查詢**: 💯 **完美表現**
|
12 |
+
- 3/3 測試通過,正確進入 generic search
|
13 |
+
- 罕見血液疾病、ITP、壞死性筋膜炎都正確處理
|
14 |
+
|
15 |
+
### **🔍 發現的關鍵問題**
|
16 |
+
|
17 |
+
#### **問題1: Level 4 驗證機制失效** ❌
|
18 |
+
**現象**: 非醫療查詢(烹飪、編程、天氣)都被當作醫療查詢處理
|
19 |
+
```
|
20 |
+
- "how to cook pasta properly?" → Level 5 (應該被拒絕)
|
21 |
+
- "programming language" → Level 5 (應該被拒絕)
|
22 |
+
- "weather forecast" → Level 5 (應該被拒絕)
|
23 |
+
```
|
24 |
+
|
25 |
+
**根本原因**: `validate_medical_query` 邏輯有問題
|
26 |
+
- LLM 雖然說"這不是醫療查詢",但函數仍然返回 `None`(表示通過驗證)
|
27 |
+
- 應該檢查 LLM 回應中是否明確說明"非醫療"
|
28 |
+
|
29 |
+
#### **問題2: Level 3 語義搜索邏輯問題** ⚠️
|
30 |
+
**現象**: 期望 Level 3 的查詢都跳到了 Level 5
|
31 |
+
```
|
32 |
+
- "emergency management of cardiovascular crisis" → Level 5 (期望 Level 3)
|
33 |
+
- "urgent neurological intervention protocols" → Level 5 (期望 Level 3)
|
34 |
+
```
|
35 |
+
|
36 |
+
**原因**: `_infer_condition_from_text` 方法可能過於嚴格,無法推斷出有效條件
|
37 |
+
|
38 |
+
#### **問題3: Level 2 行為不一致** ⚠️
|
39 |
+
**現象**:
|
40 |
+
- `level2_001` 成功,但被 Level 1 攔截了(LLM 提取了已知條件)
|
41 |
+
- `level2_002` 失敗,LLM 提取了條件但驗證失敗
|
42 |
+
|
43 |
+
## 🛠️ **需要修正的優先順序**
|
44 |
+
|
45 |
+
### **Priority 1: 修正 validate_medical_query**
|
46 |
+
```python
|
47 |
+
def validate_medical_query(self, user_query: str) -> Optional[Dict[str, Any]]:
|
48 |
+
# 檢查 LLM 回應是否明確說明非醫療
|
49 |
+
if llama_result.get('extracted_condition'):
|
50 |
+
response_text = llama_result.get('raw_response', '').lower()
|
51 |
+
|
52 |
+
# 檢查是否明確拒絕醫療查詢
|
53 |
+
rejection_phrases = [
|
54 |
+
"not a medical condition",
|
55 |
+
"outside my medical scope",
|
56 |
+
"unrelated to medical conditions",
|
57 |
+
"do not address"
|
58 |
+
]
|
59 |
+
|
60 |
+
if any(phrase in response_text for phrase in rejection_phrases):
|
61 |
+
return self._generate_invalid_query_response()
|
62 |
+
|
63 |
+
return None # 通過驗證
|
64 |
+
```
|
65 |
+
|
66 |
+
### **Priority 2: 改進語義搜索條件推斷**
|
67 |
+
`_infer_condition_from_text` 的相似度閾值可能太高(0.7),建議降低到 0.5
|
68 |
+
|
69 |
+
### **Priority 3: 優化 Level 2 LLM 提取驗證**
|
70 |
+
確保 `validate_condition` 能正確處理 LLM 的複雜回應
|
71 |
+
|
72 |
+
## 🎯 **整體評估**
|
73 |
+
|
74 |
+
### **速度表現**: ⭐⭐⭐⭐⭐
|
75 |
+
- Level 1: 瞬間響應 (0.000s)
|
76 |
+
- 平均: 14.4s(主要是 LLM 調用造成的)
|
77 |
+
|
78 |
+
### **準確性**: ⭐⭐⭐
|
79 |
+
- 預定義條件: 100% 準確
|
80 |
+
- 冷門醫療: 100% 準確
|
81 |
+
- 非醫療拒絕: 0% 準確 ← **需要立即修正**
|
82 |
+
|
83 |
+
你希望我先修正 `validate_medical_query` 的邏輯嗎?這是最關鍵的問題,解決後整體成功率應該能提升到 80%+。
|
src/__init__.py
CHANGED
@@ -3,6 +3,16 @@ OnCall.ai src package
|
|
3 |
|
4 |
This package contains the core implementation of the OnCall.ai system.
|
5 |
"""
|
6 |
-
|
7 |
# Version
|
8 |
-
__version__ = '0.1.0'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
This package contains the core implementation of the OnCall.ai system.
|
5 |
"""
|
6 |
+
|
7 |
# Version
|
8 |
+
__version__ = '0.1.0'
|
9 |
+
|
10 |
+
# import key modules
|
11 |
+
from .llm_clients import llm_Med42_70BClient
|
12 |
+
from .user_prompt import UserPromptProcessor
|
13 |
+
from .retrieval import BasicRetrievalSystem
|
14 |
+
from .medical_conditions import (
|
15 |
+
CONDITION_KEYWORD_MAPPING,
|
16 |
+
get_condition_keywords,
|
17 |
+
validate_condition
|
18 |
+
)
|
src/data_processing.py
CHANGED
@@ -106,7 +106,7 @@ class DataProcessor:
|
|
106 |
raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
|
107 |
|
108 |
# Load data
|
109 |
-
self.emergency_data = pd.read_json(str(emergency_path), lines=True) #
|
110 |
self.treatment_data = pd.read_json(str(treatment_path), lines=True)
|
111 |
|
112 |
logger.info(f"Loaded {len(self.emergency_data)} emergency records")
|
@@ -167,11 +167,8 @@ class DataProcessor:
|
|
167 |
# Get the keyword text (already lowercase)
|
168 |
actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
|
169 |
|
170 |
-
# Calculate rough window size using
|
171 |
-
|
172 |
-
# Use 512 tokens as target (model's max limit)
|
173 |
-
ROUGH_CHUNK_TARGET_TOKENS = 512
|
174 |
-
char_window = int(ROUGH_CHUNK_TARGET_TOKENS * chars_per_token / 2)
|
175 |
|
176 |
# Get rough chunk boundaries in characters
|
177 |
rough_start = max(0, keyword_pos - char_window)
|
@@ -231,73 +228,119 @@ class DataProcessor:
|
|
231 |
return chunks
|
232 |
|
233 |
def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
|
234 |
-
treatment_keywords: str, chunk_size: int =
|
235 |
doc_id: str = None) -> List[Dict[str, Any]]:
|
236 |
"""
|
237 |
Create chunks for treatment data with both emergency and treatment keywords
|
|
|
238 |
|
239 |
Args:
|
240 |
text: Input text
|
241 |
-
emergency_keywords: Emergency keywords
|
242 |
-
treatment_keywords: Treatment keywords
|
243 |
-
chunk_size: Size of each chunk
|
244 |
doc_id: Document ID for tracking
|
245 |
|
246 |
Returns:
|
247 |
-
List of chunk dictionaries
|
248 |
"""
|
249 |
-
if not treatment_keywords or pd.isna(treatment_keywords):
|
250 |
-
return []
|
251 |
-
|
252 |
chunks = []
|
253 |
-
|
254 |
-
tr_keywords = treatment_keywords.split("|") if treatment_keywords else []
|
255 |
|
256 |
-
#
|
257 |
-
|
258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
|
|
|
|
|
|
264 |
|
265 |
-
for
|
266 |
-
|
267 |
-
if
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
|
273 |
-
#
|
274 |
-
|
275 |
-
|
276 |
-
em_pos = text.lower().find(closest_em_keyword.lower())
|
277 |
-
center = (tr_pos + em_pos) // 2
|
278 |
-
else:
|
279 |
-
# Center on treatment keyword
|
280 |
-
center = tr_pos
|
281 |
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
|
|
|
|
|
|
|
|
286 |
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
}
|
300 |
-
chunks.append(chunk_info)
|
301 |
|
302 |
return chunks
|
303 |
|
@@ -308,12 +351,14 @@ class DataProcessor:
|
|
308 |
|
309 |
all_chunks = []
|
310 |
|
311 |
-
# Add progress bar
|
312 |
for idx, row in tqdm(self.emergency_data.iterrows(),
|
313 |
total=len(self.emergency_data),
|
314 |
-
desc="Processing
|
315 |
-
unit="
|
316 |
-
leave=
|
|
|
|
|
317 |
if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
|
318 |
chunks = self.create_keyword_centered_chunks(
|
319 |
text=row['clean_text'],
|
@@ -345,12 +390,14 @@ class DataProcessor:
|
|
345 |
|
346 |
all_chunks = []
|
347 |
|
348 |
-
# Add progress bar
|
349 |
for idx, row in tqdm(self.treatment_data.iterrows(),
|
350 |
total=len(self.treatment_data),
|
351 |
-
desc="Processing
|
352 |
-
unit="
|
353 |
-
leave=
|
|
|
|
|
354 |
if (pd.notna(row.get('clean_text')) and
|
355 |
pd.notna(row.get('treatment_matched'))):
|
356 |
|
@@ -454,10 +501,12 @@ class DataProcessor:
|
|
454 |
logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
|
455 |
|
456 |
for i in tqdm(range(0, len(texts), batch_size),
|
457 |
-
desc=f"Embedding {chunk_type}
|
458 |
total=total_batches,
|
459 |
-
unit="
|
460 |
-
leave=
|
|
|
|
|
461 |
batch_texts = texts[i:i + batch_size]
|
462 |
batch_emb = model.encode(
|
463 |
batch_texts,
|
|
|
106 |
raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
|
107 |
|
108 |
# Load data
|
109 |
+
self.emergency_data = pd.read_json(str(emergency_path), lines=True) # use str() to ensure path is correct
|
110 |
self.treatment_data = pd.read_json(str(treatment_path), lines=True)
|
111 |
|
112 |
logger.info(f"Loaded {len(self.emergency_data)} emergency records")
|
|
|
167 |
# Get the keyword text (already lowercase)
|
168 |
actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
|
169 |
|
170 |
+
# Calculate rough window size using simple ratio
|
171 |
+
char_window = int(chunk_size * chars_per_token / 2)
|
|
|
|
|
|
|
172 |
|
173 |
# Get rough chunk boundaries in characters
|
174 |
rough_start = max(0, keyword_pos - char_window)
|
|
|
228 |
return chunks
|
229 |
|
230 |
def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
|
231 |
+
treatment_keywords: str, chunk_size: int = None,
|
232 |
doc_id: str = None) -> List[Dict[str, Any]]:
|
233 |
"""
|
234 |
Create chunks for treatment data with both emergency and treatment keywords
|
235 |
+
using token-based separate chunking strategy with enhanced metadata for treatment chunks
|
236 |
|
237 |
Args:
|
238 |
text: Input text
|
239 |
+
emergency_keywords: Emergency keywords (pipe-separated)
|
240 |
+
treatment_keywords: Treatment keywords (pipe-separated)
|
241 |
+
chunk_size: Size of each chunk in tokens (defaults to self.chunk_size)
|
242 |
doc_id: Document ID for tracking
|
243 |
|
244 |
Returns:
|
245 |
+
List of chunk dictionaries with enhanced metadata for treatment chunks
|
246 |
"""
|
|
|
|
|
|
|
247 |
chunks = []
|
248 |
+
chunk_size = chunk_size or self.chunk_size
|
|
|
249 |
|
250 |
+
# Case 1: No keywords present
|
251 |
+
if not emergency_keywords and not treatment_keywords:
|
252 |
+
return []
|
253 |
+
|
254 |
+
# Case 2: Only emergency keywords (early return)
|
255 |
+
if emergency_keywords and not treatment_keywords:
|
256 |
+
em_chunks = self.create_keyword_centered_chunks(
|
257 |
+
text=text,
|
258 |
+
matched_keywords=emergency_keywords,
|
259 |
+
chunk_size=chunk_size,
|
260 |
+
doc_id=doc_id
|
261 |
+
)
|
262 |
+
for chunk in em_chunks:
|
263 |
+
chunk['source_type'] = 'emergency'
|
264 |
+
return em_chunks
|
265 |
+
|
266 |
+
# Case 3: Only treatment keywords (early return)
|
267 |
+
if treatment_keywords and not emergency_keywords:
|
268 |
+
tr_chunks = self.create_keyword_centered_chunks(
|
269 |
+
text=text,
|
270 |
+
matched_keywords=treatment_keywords,
|
271 |
+
chunk_size=chunk_size,
|
272 |
+
doc_id=doc_id
|
273 |
+
)
|
274 |
+
for chunk in tr_chunks:
|
275 |
+
chunk['source_type'] = 'treatment'
|
276 |
+
chunk['contains_treatment_kws'] = treatment_keywords.split('|')
|
277 |
+
chunk['contains_emergency_kws'] = []
|
278 |
+
chunk['match_type'] = 'treatment_only'
|
279 |
+
return tr_chunks
|
280 |
+
|
281 |
+
# Case 4: Both keywords present - separate processing
|
282 |
+
# Process emergency keywords
|
283 |
+
if emergency_keywords:
|
284 |
+
em_chunks = self.create_keyword_centered_chunks(
|
285 |
+
text=text,
|
286 |
+
matched_keywords=emergency_keywords,
|
287 |
+
chunk_size=chunk_size,
|
288 |
+
doc_id=doc_id
|
289 |
+
)
|
290 |
+
for chunk in em_chunks:
|
291 |
+
chunk['source_type'] = 'emergency'
|
292 |
+
chunks.extend(em_chunks)
|
293 |
+
|
294 |
+
# Process treatment keywords
|
295 |
+
if treatment_keywords:
|
296 |
+
tr_chunks = self.create_keyword_centered_chunks(
|
297 |
+
text=text,
|
298 |
+
matched_keywords=treatment_keywords,
|
299 |
+
chunk_size=chunk_size,
|
300 |
+
doc_id=doc_id
|
301 |
+
)
|
302 |
|
303 |
+
# Parse keywords for metadata
|
304 |
+
em_kws = emergency_keywords.split('|') if emergency_keywords else []
|
305 |
+
tr_kws = treatment_keywords.split('|') if treatment_keywords else []
|
306 |
+
|
307 |
+
# Add metadata for each treatment chunk
|
308 |
+
for i, chunk in enumerate(tr_chunks):
|
309 |
+
chunk_text = chunk['text'].lower()
|
310 |
|
311 |
+
# Check for keyword presence in chunk text
|
312 |
+
contains_emergency_kws = [
|
313 |
+
kw for kw in em_kws if kw.lower() in chunk_text
|
314 |
+
]
|
315 |
+
contains_treatment_kws = [
|
316 |
+
kw for kw in tr_kws if kw.lower() in chunk_text
|
317 |
+
]
|
318 |
|
319 |
+
# Determine match type based on keyword presence
|
320 |
+
has_emergency = len(contains_emergency_kws) > 0
|
321 |
+
has_treatment = len(contains_treatment_kws) > 0
|
|
|
|
|
|
|
|
|
|
|
322 |
|
323 |
+
if has_emergency and has_treatment:
|
324 |
+
match_type = "both"
|
325 |
+
elif has_emergency:
|
326 |
+
match_type = "emergency_only"
|
327 |
+
elif has_treatment:
|
328 |
+
match_type = "treatment_only"
|
329 |
+
else:
|
330 |
+
match_type = "none"
|
331 |
|
332 |
+
# Update chunk metadata
|
333 |
+
chunk.update({
|
334 |
+
'source_type': 'treatment',
|
335 |
+
'contains_emergency_kws': contains_emergency_kws,
|
336 |
+
'contains_treatment_kws': contains_treatment_kws,
|
337 |
+
'match_type': match_type,
|
338 |
+
'emergency_keywords': emergency_keywords, # Store original metadata
|
339 |
+
'treatment_keywords': treatment_keywords,
|
340 |
+
'chunk_id': f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}"
|
341 |
+
})
|
342 |
+
|
343 |
+
chunks.extend(tr_chunks)
|
|
|
|
|
344 |
|
345 |
return chunks
|
346 |
|
|
|
351 |
|
352 |
all_chunks = []
|
353 |
|
354 |
+
# Add simplified progress bar
|
355 |
for idx, row in tqdm(self.emergency_data.iterrows(),
|
356 |
total=len(self.emergency_data),
|
357 |
+
desc="Emergency Processing",
|
358 |
+
unit="docs",
|
359 |
+
leave=True,
|
360 |
+
ncols=80,
|
361 |
+
mininterval=1.0):
|
362 |
if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
|
363 |
chunks = self.create_keyword_centered_chunks(
|
364 |
text=row['clean_text'],
|
|
|
390 |
|
391 |
all_chunks = []
|
392 |
|
393 |
+
# Add simplified progress bar
|
394 |
for idx, row in tqdm(self.treatment_data.iterrows(),
|
395 |
total=len(self.treatment_data),
|
396 |
+
desc="Treatment Processing",
|
397 |
+
unit="docs",
|
398 |
+
leave=True,
|
399 |
+
ncols=80,
|
400 |
+
mininterval=1.0):
|
401 |
if (pd.notna(row.get('clean_text')) and
|
402 |
pd.notna(row.get('treatment_matched'))):
|
403 |
|
|
|
501 |
logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
|
502 |
|
503 |
for i in tqdm(range(0, len(texts), batch_size),
|
504 |
+
desc=f"Embedding {chunk_type}",
|
505 |
total=total_batches,
|
506 |
+
unit="batches",
|
507 |
+
leave=True,
|
508 |
+
ncols=80,
|
509 |
+
mininterval=0.5):
|
510 |
batch_texts = texts[i:i + batch_size]
|
511 |
batch_emb = model.encode(
|
512 |
batch_texts,
|
src/generation.py
ADDED
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
OnCall.ai Medical Advice Generation Module
|
3 |
+
|
4 |
+
This module handles:
|
5 |
+
1. RAG prompt construction from retrieval results
|
6 |
+
2. Medical advice generation using Med42-70B
|
7 |
+
3. Response formatting and confidence assessment
|
8 |
+
4. Integration with multi-dataset architecture
|
9 |
+
|
10 |
+
Author: OnCall.ai Team
|
11 |
+
Date: 2025-07-31
|
12 |
+
"""
|
13 |
+
|
14 |
+
import logging
|
15 |
+
from typing import Dict, List, Optional, Any, Union
|
16 |
+
from datetime import datetime
|
17 |
+
import json
|
18 |
+
|
19 |
+
# Import existing LLM client
|
20 |
+
from llm_clients import llm_Med42_70BClient
|
21 |
+
|
22 |
+
# Configure logging
|
23 |
+
logging.basicConfig(
|
24 |
+
level=logging.INFO,
|
25 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
26 |
+
)
|
27 |
+
logger = logging.getLogger(__name__)
|
28 |
+
|
29 |
+
class MedicalAdviceGenerator:
|
30 |
+
"""
|
31 |
+
Core generation module for medical advice using RAG approach
|
32 |
+
"""
|
33 |
+
|
34 |
+
def __init__(self, llm_client: Optional[llm_Med42_70BClient] = None):
|
35 |
+
"""
|
36 |
+
Initialize medical advice generator
|
37 |
+
|
38 |
+
Args:
|
39 |
+
llm_client: Optional Med42-70B client, creates new if None
|
40 |
+
"""
|
41 |
+
self.llm_client = llm_client or llm_Med42_70BClient()
|
42 |
+
|
43 |
+
# Dataset source priorities for different intentions
|
44 |
+
self.dataset_priorities = {
|
45 |
+
"treatment": {
|
46 |
+
"emergency_subset": 2,
|
47 |
+
"treatment_subset": 4,
|
48 |
+
"symptom_subset": 0, # Reserved for Dataset B
|
49 |
+
"diagnosis_subset": 0 # Reserved for Dataset B
|
50 |
+
},
|
51 |
+
"diagnosis": {
|
52 |
+
"emergency_subset": 4,
|
53 |
+
"treatment_subset": 2,
|
54 |
+
"symptom_subset": 0, # Reserved for Dataset B
|
55 |
+
"diagnosis_subset": 0 # Reserved for Dataset B
|
56 |
+
},
|
57 |
+
# "STAT": {
|
58 |
+
# # NOTE: Use when query contains urgent indicators like "NOW", "STAT", "critical"
|
59 |
+
# "emergency_subset": 5,
|
60 |
+
# "treatment_subset": 1,
|
61 |
+
# "symptom_subset": 0, # Reserved for Dataset B
|
62 |
+
# "diagnosis_subset": 0 # Reserved for Dataset B
|
63 |
+
# }
|
64 |
+
}
|
65 |
+
|
66 |
+
logger.info("MedicalAdviceGenerator initialized")
|
67 |
+
|
68 |
+
def generate_medical_advice(self, user_query: str, retrieval_results: Dict[str, Any],
|
69 |
+
intention: Optional[str] = None) -> Dict[str, Any]:
|
70 |
+
"""
|
71 |
+
Complete pipeline: construct prompt → generate advice → format response
|
72 |
+
|
73 |
+
Args:
|
74 |
+
user_query: Original user medical query
|
75 |
+
retrieval_results: Results from BasicRetrievalSystem.search()
|
76 |
+
intention: Optional query intention ('treatment', 'diagnosis', 'STAT'(tentative))
|
77 |
+
|
78 |
+
Returns:
|
79 |
+
Dict containing formatted medical advice and metadata
|
80 |
+
"""
|
81 |
+
try:
|
82 |
+
logger.info(f"Generating medical advice for query: '{user_query[:50]}...'")
|
83 |
+
start_time = datetime.now()
|
84 |
+
|
85 |
+
# Step 1: Extract and classify chunks from retrieval results
|
86 |
+
classified_chunks = self._classify_retrieval_chunks(retrieval_results)
|
87 |
+
|
88 |
+
# Step 2: Build RAG prompt based on intention and chunk classification
|
89 |
+
rag_prompt = self.generate_prompt(user_query, classified_chunks, intention)
|
90 |
+
|
91 |
+
# Step 3: Generate medical advice using Med42-70B
|
92 |
+
generation_result = self._generate_with_med42(rag_prompt)
|
93 |
+
|
94 |
+
# Step 4: Format structured response
|
95 |
+
formatted_response = self._format_medical_response(
|
96 |
+
user_query=user_query,
|
97 |
+
generated_advice=generation_result,
|
98 |
+
chunks_used=classified_chunks,
|
99 |
+
intention=intention,
|
100 |
+
processing_time=(datetime.now() - start_time).total_seconds()
|
101 |
+
)
|
102 |
+
|
103 |
+
processing_duration = formatted_response.get('query_metadata', {}).get('processing_time_seconds', 0)
|
104 |
+
logger.info(f"Medical advice generated successfully in {processing_duration:.3f}s")
|
105 |
+
return formatted_response
|
106 |
+
|
107 |
+
except Exception as e:
|
108 |
+
logger.error(f"Medical advice generation failed: {e}")
|
109 |
+
return self._generate_error_response(user_query, str(e))
|
110 |
+
|
111 |
+
def generate_prompt(self, user_query: str, classified_chunks: Dict[str, List],
|
112 |
+
intention: Optional[str] = None) -> str:
|
113 |
+
"""
|
114 |
+
Enhanced prompt generator with flexible dataset integration
|
115 |
+
|
116 |
+
Args:
|
117 |
+
user_query: User's medical query
|
118 |
+
classified_chunks: Chunks classified by dataset source
|
119 |
+
intention: Query intention if detected
|
120 |
+
|
121 |
+
Returns:
|
122 |
+
Structured RAG prompt for Med42-70B
|
123 |
+
"""
|
124 |
+
logger.info(f"Generating prompt with intention: {intention}")
|
125 |
+
|
126 |
+
# Extract chunks by dataset source
|
127 |
+
emergency_chunks = classified_chunks.get("emergency_subset", [])
|
128 |
+
treatment_chunks = classified_chunks.get("treatment_subset", [])
|
129 |
+
symptom_chunks = classified_chunks.get("symptom_subset", []) # Dataset B (future)
|
130 |
+
diagnosis_chunks = classified_chunks.get("diagnosis_subset", []) # Dataset B (future)
|
131 |
+
|
132 |
+
# Select chunks based on intention or intelligent defaults
|
133 |
+
selected_chunks = self._select_chunks_by_intention(
|
134 |
+
intention=intention,
|
135 |
+
emergency_chunks=emergency_chunks,
|
136 |
+
treatment_chunks=treatment_chunks,
|
137 |
+
symptom_chunks=symptom_chunks,
|
138 |
+
diagnosis_chunks=diagnosis_chunks
|
139 |
+
)
|
140 |
+
|
141 |
+
# Build context block from selected chunks
|
142 |
+
context_block = self._build_context_block(selected_chunks)
|
143 |
+
|
144 |
+
# Construct medical RAG prompt
|
145 |
+
prompt = self._construct_medical_prompt(user_query, context_block, intention)
|
146 |
+
|
147 |
+
logger.info(f"Generated prompt with {len(selected_chunks)} chunks, {len(context_block)} chars")
|
148 |
+
return prompt
|
149 |
+
|
150 |
+
def _classify_retrieval_chunks(self, retrieval_results: Dict[str, Any]) -> Dict[str, List]:
|
151 |
+
"""
|
152 |
+
Classify retrieval chunks by dataset source
|
153 |
+
|
154 |
+
Args:
|
155 |
+
retrieval_results: Results from BasicRetrievalSystem.search()
|
156 |
+
|
157 |
+
Returns:
|
158 |
+
Dict mapping dataset sources to chunk lists
|
159 |
+
"""
|
160 |
+
classified = {
|
161 |
+
"emergency_subset": [],
|
162 |
+
"treatment_subset": [],
|
163 |
+
"symptom_subset": [], # Reserved for Dataset B
|
164 |
+
"diagnosis_subset": [] # Reserved for Dataset B
|
165 |
+
}
|
166 |
+
|
167 |
+
# Process results from current dual-index system
|
168 |
+
processed_results = retrieval_results.get('processed_results', [])
|
169 |
+
|
170 |
+
for chunk in processed_results:
|
171 |
+
chunk_type = chunk.get('type', 'unknown')
|
172 |
+
|
173 |
+
# Map current system types to dataset sources
|
174 |
+
if chunk_type == 'emergency':
|
175 |
+
classified["emergency_subset"].append(chunk)
|
176 |
+
elif chunk_type == 'treatment':
|
177 |
+
classified["treatment_subset"].append(chunk)
|
178 |
+
else:
|
179 |
+
# Unknown type, classify by content analysis or default to STAT (tentative)
|
180 |
+
logger.warning(f"Unknown chunk type: {chunk_type}, defaulting to STAT (tentative)")
|
181 |
+
classified["emergency_subset"].append(chunk)
|
182 |
+
|
183 |
+
# TODO: Future integration point for Dataset B
|
184 |
+
# When Dataset B team provides symptom/diagnosis data:
|
185 |
+
# classified["symptom_subset"] = process_dataset_b_symptoms(retrieval_results)
|
186 |
+
# classified["diagnosis_subset"] = process_dataset_b_diagnosis(retrieval_results)
|
187 |
+
|
188 |
+
logger.info(f"Classified chunks: Emergency={len(classified['emergency_subset'])}, "
|
189 |
+
f"Treatment={len(classified['treatment_subset'])}")
|
190 |
+
|
191 |
+
return classified
|
192 |
+
|
193 |
+
def _select_chunks_by_intention(self, intention: Optional[str],
|
194 |
+
emergency_chunks: List, treatment_chunks: List,
|
195 |
+
symptom_chunks: List, diagnosis_chunks: List) -> List:
|
196 |
+
"""
|
197 |
+
Select optimal chunk combination based on query intention
|
198 |
+
|
199 |
+
Args:
|
200 |
+
intention: Detected or specified intention
|
201 |
+
*_chunks: Chunks from different dataset sources
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
List of selected chunks for prompt construction
|
205 |
+
"""
|
206 |
+
if intention and intention in self.dataset_priorities:
|
207 |
+
# Use predefined priorities for known intentions
|
208 |
+
priorities = self.dataset_priorities[intention]
|
209 |
+
selected_chunks = []
|
210 |
+
|
211 |
+
# Add chunks according to priority allocation
|
212 |
+
selected_chunks.extend(emergency_chunks[:priorities["emergency_subset"]])
|
213 |
+
selected_chunks.extend(treatment_chunks[:priorities["treatment_subset"]])
|
214 |
+
|
215 |
+
# TODO: Future Dataset B integration
|
216 |
+
# selected_chunks.extend(symptom_chunks[:priorities["symptom_subset"]])
|
217 |
+
# selected_chunks.extend(diagnosis_chunks[:priorities["diagnosis_subset"]])
|
218 |
+
|
219 |
+
logger.info(f"Selected chunks by intention '{intention}': {len(selected_chunks)} total")
|
220 |
+
|
221 |
+
else:
|
222 |
+
# No specific intention - let LLM judge from best available chunks
|
223 |
+
all_chunks = emergency_chunks + treatment_chunks + symptom_chunks + diagnosis_chunks
|
224 |
+
|
225 |
+
# Sort by relevance (distance) and take top 6
|
226 |
+
all_chunks_sorted = sorted(all_chunks, key=lambda x: x.get("distance", 999))
|
227 |
+
selected_chunks = all_chunks_sorted[:6]
|
228 |
+
|
229 |
+
logger.info(f"Selected chunks by relevance (no intention): {len(selected_chunks)} total")
|
230 |
+
|
231 |
+
return selected_chunks
|
232 |
+
|
233 |
+
def _build_context_block(self, selected_chunks: List) -> str:
|
234 |
+
"""
|
235 |
+
Build formatted context block from selected chunks
|
236 |
+
|
237 |
+
Args:
|
238 |
+
selected_chunks: List of selected chunks
|
239 |
+
|
240 |
+
Returns:
|
241 |
+
Formatted context string for prompt
|
242 |
+
"""
|
243 |
+
if not selected_chunks:
|
244 |
+
return "No relevant medical guidelines found."
|
245 |
+
|
246 |
+
context_parts = []
|
247 |
+
|
248 |
+
for i, chunk in enumerate(selected_chunks, 1):
|
249 |
+
chunk_text = chunk.get("text", "").strip()
|
250 |
+
chunk_type = chunk.get("type", "unknown")
|
251 |
+
distance = chunk.get("distance", 0)
|
252 |
+
|
253 |
+
# Format each chunk with metadata
|
254 |
+
context_part = f"""
|
255 |
+
[Guideline {i}] (Source: {chunk_type.title()}, Relevance: {1-distance:.3f})
|
256 |
+
{chunk_text}
|
257 |
+
""".strip()
|
258 |
+
|
259 |
+
context_parts.append(context_part)
|
260 |
+
|
261 |
+
return "\n\n".join(context_parts)
|
262 |
+
|
263 |
+
def _construct_medical_prompt(self, user_query: str, context_block: str,
|
264 |
+
intention: Optional[str]) -> str:
|
265 |
+
"""
|
266 |
+
Construct final medical RAG prompt with appropriate framing
|
267 |
+
|
268 |
+
Args:
|
269 |
+
user_query: Original user query
|
270 |
+
context_block: Formatted context from selected chunks
|
271 |
+
intention: Query intention if detected
|
272 |
+
|
273 |
+
Returns:
|
274 |
+
Complete RAG prompt for Med42-70B
|
275 |
+
"""
|
276 |
+
# Customize prompt based on intention
|
277 |
+
if intention == "treatment":
|
278 |
+
focus_guidance = "Focus on providing specific treatment protocols, management steps, and therapeutic interventions."
|
279 |
+
elif intention == "diagnosis":
|
280 |
+
focus_guidance = "Focus on differential diagnosis, diagnostic criteria, and assessment approaches."
|
281 |
+
elif intention == "STAT(tentative)":
|
282 |
+
focus_guidance = "Focus on immediate emergency interventions and critical decision-making steps."
|
283 |
+
else:
|
284 |
+
focus_guidance = "Provide comprehensive medical guidance covering both diagnostic and treatment aspects as appropriate."
|
285 |
+
|
286 |
+
prompt = f"""You are an experienced attending physician providing guidance to a junior clinician in an emergency setting. A colleague is asking for your expert medical opinion.
|
287 |
+
|
288 |
+
Clinical Question:
|
289 |
+
{user_query}
|
290 |
+
|
291 |
+
Relevant Medical Guidelines:
|
292 |
+
{context_block}
|
293 |
+
|
294 |
+
Instructions:
|
295 |
+
{focus_guidance}
|
296 |
+
|
297 |
+
Please provide a clear, actionable response that:
|
298 |
+
1. Addresses the specific clinical question asked
|
299 |
+
2. References relevant evidence from the provided guidelines
|
300 |
+
3. Offers practical, step-by-step guidance when appropriate
|
301 |
+
4. Maintains appropriate medical caution and emphasizes the need for clinical judgment
|
302 |
+
|
303 |
+
Your response should be concise but comprehensive, suitable for immediate clinical application."""
|
304 |
+
|
305 |
+
return prompt
|
306 |
+
|
307 |
+
def _generate_with_med42(self, prompt: str) -> Dict[str, Any]:
    """
    Generate medical advice by delegating the prompt to Med42-70B.

    Args:
        prompt: Complete RAG prompt

    Returns:
        Generation result with metadata

    Raises:
        Exception: If the client reports an error or the call itself fails.
    """
    try:
        logger.info("Calling Med42-70B for medical advice generation")

        generation = self.llm_client.analyze_medical_query(
            query=prompt,
            max_tokens=500,   # Adjust based on needs
            timeout=30.0,     # Allow more time for complex medical advice
        )

        # The client signals failures via an 'error' key instead of raising.
        reported_error = generation.get('error')
        if reported_error:
            raise Exception(f"Med42-70B generation error: {reported_error}")

        return generation

    except Exception as e:
        logger.error(f"Med42-70B generation failed: {e}")
        raise
|
334 |
+
|
335 |
+
def _format_medical_response(self, user_query: str, generated_advice: Dict[str, Any],
                             chunks_used: Dict[str, List], intention: Optional[str],
                             processing_time: float) -> Dict[str, Any]:
    """
    Package generated advice with metadata, sourcing info and confidence.

    Args:
        user_query: Original query
        generated_advice: Result from Med42-70B
        chunks_used: Classification of chunks used
        intention: Detected intention
        processing_time: Total processing time

    Returns:
        Structured medical advice response
    """
    # Prefer the extracted condition text; fall back to the raw model output.
    advice_text = generated_advice.get('extracted_condition', '')
    advice_text = advice_text or generated_advice.get('raw_response', 'Unable to generate medical advice.')

    # Confidence combines model signal with source coverage.
    confidence = self._calculate_confidence_score(generated_advice, chunks_used)

    # Per-source chunk tally for traceability.
    per_source = {source: len(items) for source, items in chunks_used.items()}
    total_chunks = sum(per_source.values())

    return {
        "medical_advice": advice_text,
        "confidence_score": confidence,
        "query_metadata": {
            "original_query": user_query,
            "detected_intention": intention,
            "processing_time_seconds": processing_time,
            "total_chunks_used": total_chunks,
            "chunks_by_source": per_source
        },
        "generation_metadata": {
            "model_used": "m42-health/Llama3-Med42-70B",
            "generation_time": generated_advice.get('latency', 0),
            "model_confidence": generated_advice.get('confidence', 'unknown'),
            "timestamp": datetime.now().isoformat()
        },
        "sources": {
            "emergency_sources": len(chunks_used.get("emergency_subset", [])),
            "treatment_sources": len(chunks_used.get("treatment_subset", [])),
            "total_sources": total_chunks
        },
        "disclaimer": "This advice is for informational purposes only and should not replace professional medical consultation. Always consult with qualified healthcare providers for medical decisions."
    }
|
388 |
+
|
389 |
+
def _calculate_confidence_score(self, generated_advice: Dict[str, Any],
|
390 |
+
chunks_used: Dict[str, List]) -> float:
|
391 |
+
"""
|
392 |
+
Calculate confidence score based on generation quality and source reliability
|
393 |
+
|
394 |
+
Args:
|
395 |
+
generated_advice: Result from Med42-70B
|
396 |
+
chunks_used: Chunks used in generation
|
397 |
+
|
398 |
+
Returns:
|
399 |
+
Confidence score between 0.0 and 1.0
|
400 |
+
"""
|
401 |
+
confidence_factors = []
|
402 |
+
|
403 |
+
# Factor 1: Model confidence if available
|
404 |
+
model_confidence = generated_advice.get('confidence', '0.5')
|
405 |
+
try:
|
406 |
+
model_conf_value = float(model_confidence)
|
407 |
+
confidence_factors.append(model_conf_value)
|
408 |
+
except (ValueError, TypeError):
|
409 |
+
confidence_factors.append(0.5) # Default neutral confidence
|
410 |
+
|
411 |
+
# Factor 2: Number of sources used (more sources = higher confidence)
|
412 |
+
total_chunks = sum(len(chunks) for chunks in chunks_used.values())
|
413 |
+
source_confidence = min(total_chunks / 6.0, 1.0) # Normalize to max 6 chunks
|
414 |
+
confidence_factors.append(source_confidence)
|
415 |
+
|
416 |
+
# Factor 3: Response length (reasonable length indicates comprehensive advice)
|
417 |
+
response_length = len(generated_advice.get('raw_response', ''))
|
418 |
+
length_confidence = min(response_length / 500.0, 1.0) # Normalize to ~500 chars
|
419 |
+
confidence_factors.append(length_confidence)
|
420 |
+
|
421 |
+
# Factor 4: Processing success (no errors = higher confidence)
|
422 |
+
if generated_advice.get('error'):
|
423 |
+
confidence_factors.append(0.3) # Lower confidence if errors occurred
|
424 |
+
else:
|
425 |
+
confidence_factors.append(0.8) # Higher confidence for clean generation
|
426 |
+
|
427 |
+
# Calculate weighted average
|
428 |
+
final_confidence = sum(confidence_factors) / len(confidence_factors)
|
429 |
+
|
430 |
+
# Ensure confidence is within valid range
|
431 |
+
return max(0.1, min(0.95, final_confidence))
|
432 |
+
|
433 |
+
def _generate_error_response(self, user_query: str, error_message: str) -> Dict[str, Any]:
|
434 |
+
"""
|
435 |
+
Generate error response when generation fails
|
436 |
+
|
437 |
+
Args:
|
438 |
+
user_query: Original query
|
439 |
+
error_message: Error details
|
440 |
+
|
441 |
+
Returns:
|
442 |
+
Error response in standard format
|
443 |
+
"""
|
444 |
+
return {
|
445 |
+
"medical_advice": "I apologize, but I encountered an error while processing your medical query. Please try rephrasing your question or contact technical support if the issue persists.",
|
446 |
+
"confidence_score": 0.0,
|
447 |
+
"query_metadata": {
|
448 |
+
"original_query": user_query,
|
449 |
+
"detected_intention": None,
|
450 |
+
"processing_time_seconds": 0.0,
|
451 |
+
"total_chunks_used": 0,
|
452 |
+
"chunks_by_source": {}
|
453 |
+
},
|
454 |
+
"generation_metadata": {
|
455 |
+
"model_used": "m42-health/Llama3-Med42-70B",
|
456 |
+
"error": error_message,
|
457 |
+
"timestamp": datetime.now().isoformat()
|
458 |
+
},
|
459 |
+
"sources": {
|
460 |
+
"emergency_sources": 0,
|
461 |
+
"treatment_sources": 0,
|
462 |
+
"total_sources": 0
|
463 |
+
},
|
464 |
+
"disclaimer": "This system experienced a technical error. Please consult with qualified healthcare providers for medical decisions."
|
465 |
+
}
|
466 |
+
|
467 |
+
# Example usage and testing
|
468 |
+
def main():
    """
    Smoke-test the medical advice generation system with canned retrieval data.
    """
    generator = MedicalAdviceGenerator()

    # Simulated retrieval output: one emergency + one treatment chunk.
    simulated_retrieval = {
        "processed_results": [
            {
                "type": "emergency",
                "distance": 0.3,
                "text": "Acute myocardial infarction requires immediate assessment including ECG, cardiac enzymes, and chest X-ray. Time-sensitive condition requiring rapid intervention.",
                "matched": "MI|chest pain"
            },
            {
                "type": "treatment",
                "distance": 0.25,
                "text": "Treatment protocol for STEMI includes aspirin 325mg, clopidogrel loading dose, and urgent PCI within 90 minutes when available.",
                "matched_treatment": "aspirin|PCI|thrombolytic"
            }
        ]
    }

    # (query, intention) pairs exercising the main intentions.
    cases = [
        ("How should I treat a patient with chest pain?", "treatment"),
        ("What are the signs of acute MI?", "diagnosis"),
        # ("Emergency management of cardiac arrest", "STAT(tentative)")
    ]

    for query, intention in cases:
        print(f"\n{'='*60}")
        print(f"Testing: {query}")
        print(f"Intention: {intention}")

        try:
            result = generator.generate_medical_advice(
                user_query=query,
                retrieval_results=simulated_retrieval,
                intention=intention
            )
            print(f"✅ Success: {result['confidence_score']:.2f} confidence")
            print(f"Advice: {result['medical_advice'][:200]}...")
        except Exception as e:
            print(f"❌ Error: {e}")


if __name__ == "__main__":
    main()
|
src/llm_clients.py
ADDED
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
OnCall.ai LLM Clients Module
|
3 |
+
|
4 |
+
Provides specialized LLM clients for medical query processing.
|
5 |
+
|
6 |
+
Author: OnCall.ai Team
|
7 |
+
Date: 2025-07-29
|
8 |
+
"""
|
9 |
+
|
10 |
+
import logging
|
11 |
+
import os
|
12 |
+
from typing import Dict, Optional, Union
|
13 |
+
from huggingface_hub import InferenceClient
|
14 |
+
from dotenv import load_dotenv
|
15 |
+
|
16 |
+
# Load environment variables from .env file
|
17 |
+
load_dotenv()
|
18 |
+
|
19 |
+
class llm_Med42_70BClient:
    """Hugging Face Inference API client for the Llama3-Med42-70B medical model.

    Extracts a representative medical condition from free-text queries and
    flags non-medical queries. Research tool only — not a substitute for
    professional medical advice.
    """

    def __init__(
        self,
        model_name: str = "m42-health/Llama3-Med42-70B",
        timeout: float = 30.0
    ):
        """
        Initialize Medical LLM client for query processing.

        Args:
            model_name: Hugging Face model name
            timeout: API call timeout duration

        Raises:
            ValueError: If HF_TOKEN is missing or client construction fails.

        Warning: This model should not be used for professional medical advice.
        """
        self.logger = logging.getLogger(__name__)
        # NOTE(review): timeout is stored but never passed to the API call — confirm intended.
        self.timeout = timeout

        # Configure logging to show detailed information
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

        # Get Hugging Face token from environment (.env is loaded at module import)
        hf_token = os.getenv('HF_TOKEN')
        if not hf_token:
            self.logger.error("HF_TOKEN is missing from environment variables.")
            raise ValueError(
                "HF_TOKEN not found in environment variables. "
                "Please set HF_TOKEN in your .env file or environment. "
                "Ensure the token is not empty and is correctly set."
            )

        try:
            # Initialize InferenceClient with the new model
            # NOTE(review): model_name is only logged here; the chat call in
            # analyze_medical_query hardcodes the model id — keep the two in sync.
            self.client = InferenceClient(
                provider="featherless-ai",
                api_key=hf_token
            )

            self.logger.info(f"Medical LLM client initialized with model: {model_name}")
            self.logger.warning(
                "Medical LLM Model: Research tool only. "
                "Not for professional medical diagnosis."
            )
        except Exception as e:
            self.logger.error(f"Failed to initialize InferenceClient: {str(e)}")
            self.logger.error(f"Error Type: {type(e).__name__}")
            self.logger.error(f"Detailed Error: {repr(e)}")
            raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e

    def analyze_medical_query(
        self,
        query: str,
        max_tokens: int = 100,
        timeout: Optional[float] = None
    ) -> Dict[str, Union[str, float]]:
        """
        Analyze medical query and extract condition.

        Args:
            query: Medical query text
            max_tokens: Maximum tokens to generate
            timeout: Specific API call timeout
                NOTE(review): accepted but not used by the call below — confirm.

        Returns:
            Extracted medical condition information with latency. On failure
            the dict carries an 'error' key, an empty 'extracted_condition'
            and confidence '0' instead of raising.
        """
        import time

        # Start timing
        start_time = time.time()

        try:
            self.logger.info(f"Calling Medical LLM with query: {query}")

            # Prepare chat completion request with updated system prompt
            response = self.client.chat.completions.create(
                model="m42-health/Llama3-Med42-70B",
                messages=[
                    {
                        "role": "system",
                        "content": """You are a medical assistant trained to extract medical conditions.

For medical queries: Extract the most representative medical condition name.
For non-medical queries: Respond with "NON_MEDICAL_QUERY" and briefly explain why it's not medical.

Examples:
- Medical: "chest pain" → "Acute Coronary Syndrome"
- Non-medical: "cooking pasta" → "NON_MEDICAL_QUERY. This is about culinary techniques, not medical conditions."

DO NOT provide medical advice."""
                    },
                    {
                        "role": "user",
                        "content": query
                    }
                ],
                max_tokens=max_tokens
            )

            # Calculate latency
            end_time = time.time()
            latency = end_time - start_time

            # Extract the response text (guard against a None content field)
            response_text = response.choices[0].message.content or ""

            # Log raw response and latency
            self.logger.info(f"Raw LLM Response: {response_text}")
            self.logger.info(f"Query Latency: {latency:.4f} seconds")

            # Extract condition from response
            extracted_condition = self._extract_condition(response_text)

            # Log the extracted condition
            self.logger.info(f"Extracted Condition: {extracted_condition}")

            return {
                'extracted_condition': extracted_condition,
                'confidence': '0.8',  # NOTE(review): fixed placeholder, not model-derived
                'raw_response': response_text,
                'latency': latency  # Add latency to the return dictionary
            }

        except Exception as e:
            # Calculate latency even for failed requests
            end_time = time.time()
            latency = end_time - start_time

            self.logger.error(f"Medical LLM query error: {str(e)}")
            self.logger.error(f"Error Type: {type(e).__name__}")
            self.logger.error(f"Detailed Error: {repr(e)}")
            self.logger.error(f"Query Latency (on error): {latency:.4f} seconds")

            # Additional context logging
            self.logger.error(f"Query that caused error: {query}")

            return {
                'extracted_condition': '',
                'confidence': '0',
                'error': str(e),
                'latency': latency  # Include latency even for error cases
            }

    def _extract_condition(self, response: str) -> str:
        """
        Extract medical condition from model response.

        Args:
            response: Full model-generated text

        Returns:
            Extracted medical condition or empty string if non-medical
        """
        # Check if this is a rejection response first
        if self._is_rejection_response(response):
            return ""

        # Local import avoids a hard dependency at module import time.
        from medical_conditions import CONDITION_KEYWORD_MAPPING

        # Search in known medical conditions (case-insensitive substring match)
        for condition in CONDITION_KEYWORD_MAPPING.keys():
            if condition.lower() in response.lower():
                return condition

        # Fallback: first line of the raw response (may be a free-form name).
        return response.split('\n')[0].strip() or ""

    def _is_rejection_response(self, response: str) -> bool:
        """
        Dual-layer detection: prompt compliance + natural language patterns

        Args:
            response: LLM response text

        Returns:
            True if response indicates non-medical query rejection
        """
        response_upper = response.upper()
        response_lower = response.lower()

        # Layer 1: Check for standardized format (if LLM follows prompt)
        if "NON_MEDICAL_QUERY" in response_upper:
            return True

        # Layer 2: Check natural language rejection patterns (fallback)
        rejection_patterns = [
            "i do not address",
            "do not address",
            "outside my biomedical scope",
            "outside my medical scope",
            "unrelated to medical conditions",
            "not about a medical condition",
            "not a medical condition",
            "this query is outside",
            "culinary practice",  # cooking-related
            "technology trends",  # programming-related
            "meteorology",  # weather-related
            "non-medical context"
        ]

        return any(pattern in response_lower for pattern in rejection_patterns)
|
222 |
+
|
223 |
+
def main():
    """
    Test Medical LLM client functionality

    Runs a handful of sample queries through llm_Med42_70BClient, printing
    per-query results and a timing summary. Returns a results dict; on
    failure returns a dict with 'error' and the elapsed time instead.
    """
    import time
    from datetime import datetime

    # Record total execution start time
    total_start_time = time.time()
    execution_start_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    try:
        print(f"Execution Started at: {execution_start_timestamp}")

        # Test client initialization (raises ValueError if HF_TOKEN missing)
        client = llm_Med42_70BClient()

        test_queries = [
            "patient experiencing chest pain",
            "sudden weakness on one side",
            "severe headache with neurological symptoms"
        ]

        # Store individual query results
        query_results = []

        for query in test_queries:
            print(f"\nTesting query: {query}")
            result = client.analyze_medical_query(query)

            # Store query result
            query_result = {
                'query': query,
                'extracted_condition': result.get('extracted_condition', 'N/A'),
                'confidence': result.get('confidence', 'N/A'),
                'latency': result.get('latency', 'N/A')
            }
            query_results.append(query_result)

            # Print individual query results
            # NOTE(review): if 'latency' is missing, the 'N/A' default makes the
            # :.4f format below raise — confirm analyze_medical_query always
            # returns a float latency.
            print("Extracted Condition:", query_result['extracted_condition'])
            print("Confidence:", query_result['confidence'])
            print(f"Latency: {query_result['latency']:.4f} seconds")

            if 'error' in result:
                print("Error:", result['error'])
            print("---")

        # Calculate total execution time
        total_end_time = time.time()
        total_execution_time = total_end_time - total_start_time
        execution_end_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Print summary
        print("\n--- Execution Summary ---")
        print(f"Execution Started at: {execution_start_timestamp}")
        print(f"Execution Ended at: {execution_end_timestamp}")
        print(f"Total Execution Time: {total_execution_time:.4f} seconds")

        # Optional: Return results for potential further processing
        return {
            'start_time': execution_start_timestamp,
            'end_time': execution_end_timestamp,
            'total_execution_time': total_execution_time,
            'query_results': query_results
        }

    except Exception as e:
        print(f"Client initialization error: {str(e)}")
        print("Possible issues:")
        print("1. Invalid or missing Hugging Face token")
        print("2. Network connectivity problems")
        print("3. Model access restrictions")
        print("\nPlease check your .env file and Hugging Face token.")

        # Calculate total execution time even in case of error
        total_end_time = time.time()
        total_execution_time = total_end_time - total_start_time

        return {
            'error': str(e),
            'total_execution_time': total_execution_time
        }

if __name__ == "__main__":
    main()
|
src/medical_conditions.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
OnCall.ai Medical Conditions Configuration
|
3 |
+
|
4 |
+
This module provides centralized configuration for:
|
5 |
+
1. Predefined medical conditions
|
6 |
+
2. Condition-to-keyword mappings
|
7 |
+
3. Fallback condition keywords
|
8 |
+
|
9 |
+
Author: OnCall.ai Team
|
10 |
+
Date: 2025-07-29
|
11 |
+
"""
|
12 |
+
|
13 |
+
from typing import Dict, Optional
|
14 |
+
|
15 |
+
# Comprehensive Condition-to-Keyword Mapping
# Maps a condition name to pipe-separated keyword strings consumed by the
# retrieval layer: "emergency" keywords target emergency-subset chunks,
# "treatment" keywords target treatment-subset chunks.
# NOTE(review): key style is inconsistent (spaces vs underscores, e.g.
# "acute stroke" vs "acute_ischemic_stroke") — lookups must match exactly.
CONDITION_KEYWORD_MAPPING: Dict[str, Dict[str, str]] = {
    "acute myocardial infarction": {
        "emergency": "MI|chest pain|cardiac arrest",
        "treatment": "aspirin|nitroglycerin|thrombolytic|PCI"
    },
    "acute stroke": {
        "emergency": "stroke|neurological deficit|sudden weakness",
        "treatment": "tPA|thrombolysis|stroke unit care"
    },
    "pulmonary embolism": {
        "emergency": "chest pain|shortness of breath|sudden dyspnea",
        "treatment": "anticoagulation|heparin|embolectomy"
    },
    # extended from @20250729Test_Retrieval.md
    "acute_ischemic_stroke": {
        "emergency": "ischemic stroke|neurological deficit",
        "treatment": "tPA|stroke unit management"
    },
    "hemorrhagic_stroke": {
        "emergency": "hemorrhagic stroke|intracranial bleeding",
        "treatment": "blood pressure control|neurosurgery"
    },
    "transient_ischemic_attack": {
        "emergency": "TIA|temporary stroke symptoms",
        "treatment": "antiplatelet|lifestyle modification"
    },
    "acute_coronary_syndrome": {
        "emergency": "ACS|chest pain|ECG changes",
        "treatment": "antiplatelet|statins|cardiac monitoring"
    }
}

# Fallback Condition Keywords
# Free-text search strings used when a condition needs a generic retrieval
# query (see get_condition_keywords).
FALLBACK_CONDITION_KEYWORDS: Dict[str, str] = {
    "acute_ischemic_stroke": "acute ischemic stroke treatment",
    "hemorrhagic_stroke": "hemorrhagic stroke management",
    "transient_ischemic_attack": "TIA treatment protocol",
    "acute_coronary_syndrome": "ACS treatment guidelines",
    "stable_angina": "stable angina management",
    "non_cardiac_chest_pain": "non-cardiac chest pain evaluation",
    "witnessed_cardiac_arrest": "witnessed cardiac arrest protocol",
    "unwitnessed_cardiac_arrest": "unwitnessed cardiac arrest management",
    "post_resuscitation_care": "post-resuscitation care guidelines"
}
|
60 |
+
|
61 |
+
def get_condition_keywords(specific_condition: str) -> Optional[str]:
    """
    Map a condition name to its fallback search keywords.

    Args:
        specific_condition: Medical condition name

    Returns:
        The mapped keyword string when the condition is known, otherwise
        the condition name itself, unchanged.
    """
    if specific_condition in FALLBACK_CONDITION_KEYWORDS:
        return FALLBACK_CONDITION_KEYWORDS[specific_condition]
    return specific_condition
|
72 |
+
|
73 |
+
def validate_condition(condition: str) -> bool:
    """
    Check whether a condition exists in the predefined mapping.

    Matching is case-insensitive against CONDITION_KEYWORD_MAPPING keys.

    Args:
        condition: Medical condition to validate

    Returns:
        Boolean indicating condition validity
    """
    needle = condition.lower()
    return any(known.lower() == needle for known in CONDITION_KEYWORD_MAPPING)
|
84 |
+
|
85 |
+
def get_condition_details(condition: str) -> Optional[Dict[str, str]]:
    """
    Retrieve the keyword entry for a condition, case-insensitively.

    Args:
        condition: Medical condition name

    Returns:
        Dict with emergency and treatment keywords, or None when unknown.
    """
    target = condition.lower()
    return next(
        (keywords for name, keywords in CONDITION_KEYWORD_MAPPING.items()
         if name.lower() == target),
        None,
    )
|
src/retrieval.py
ADDED
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Basic Retrieval System for OnCall.ai
|
3 |
+
|
4 |
+
This module implements the core vector retrieval functionality:
|
5 |
+
- Basic vector search
|
6 |
+
- Source marking
|
7 |
+
- Unified output format
|
8 |
+
"""
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import json
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import Dict, List, Tuple, Any, Optional
|
14 |
+
from sentence_transformers import SentenceTransformer
|
15 |
+
from annoy import AnnoyIndex
|
16 |
+
import logging
|
17 |
+
|
18 |
+
# Configure logging
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.INFO,
|
21 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
22 |
+
)
|
23 |
+
logger = logging.getLogger(__name__)
|
24 |
+
|
25 |
+
class BasicRetrievalSystem:
|
26 |
+
"""Basic vector retrieval system for medical documents"""
|
27 |
+
|
28 |
+
def __init__(self, embedding_dim: int = 768):
    """
    Initialize the retrieval system

    Loads the embedding model, chunk metadata, pre-computed embeddings and
    Annoy indices eagerly, so construction can take several seconds and may
    raise on any missing resource.

    Args:
        embedding_dim: Dimension of embeddings (default: 768 for PubMedBERT)
    """
    self.embedding_dim = embedding_dim
    self.embedding_model = None  # SentenceTransformer, set in _initialize_system
    self.emergency_index = None  # AnnoyIndex over the emergency embeddings
    self.treatment_index = None  # AnnoyIndex over the treatment embeddings
    self.emergency_chunks = {}   # chunk metadata loaded from JSON
    self.treatment_chunks = {}   # chunk metadata loaded from JSON

    # Initialize system
    self._initialize_system()
|
44 |
+
|
45 |
+
def _initialize_system(self) -> None:
    """Initialize embeddings, indices and chunks

    Raises:
        Exception: Re-raised from any failing step after logging.
    """
    try:
        logger.info("Initializing retrieval system...")

        # Initialize embedding model
        # NOTE(review): downloads the model on first use — needs network access.
        self.embedding_model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
        logger.info("Embedding model loaded successfully")

        # Initialize Annoy indices ('angular' = cosine-based distance)
        self.emergency_index = AnnoyIndex(self.embedding_dim, 'angular')
        self.treatment_index = AnnoyIndex(self.embedding_dim, 'angular')

        # Load data — path resolution assumes this file lives in <root>/src/
        current_file = Path(__file__)
        project_root = current_file.parent.parent  # from src to root
        base_path = project_root / "models"
        self._load_chunks(base_path)
        self._load_embeddings(base_path)
        self._build_or_load_indices(base_path)

        logger.info("Retrieval system initialized successfully")

    except Exception as e:
        logger.error(f"Failed to initialize retrieval system: {e}")
        raise
|
71 |
+
|
72 |
+
def _load_chunks(self, base_path: Path) -> None:
|
73 |
+
"""Load chunk data from JSON files"""
|
74 |
+
try:
|
75 |
+
# Load emergency chunks
|
76 |
+
with open(base_path / "embeddings" / "emergency_chunks.json", 'r') as f:
|
77 |
+
self.emergency_chunks = json.load(f)
|
78 |
+
|
79 |
+
# Load treatment chunks
|
80 |
+
with open(base_path / "embeddings" / "treatment_chunks.json", 'r') as f:
|
81 |
+
self.treatment_chunks = json.load(f)
|
82 |
+
|
83 |
+
logger.info("Chunks loaded successfully")
|
84 |
+
|
85 |
+
except FileNotFoundError as e:
|
86 |
+
logger.error(f"Chunk file not found: {e}")
|
87 |
+
raise
|
88 |
+
except json.JSONDecodeError as e:
|
89 |
+
logger.error(f"Invalid JSON in chunk file: {e}")
|
90 |
+
raise
|
91 |
+
|
92 |
+
def _load_embeddings(self, base_path: Path) -> None:
|
93 |
+
"""Load pre-computed embeddings"""
|
94 |
+
try:
|
95 |
+
# Load emergency embeddings
|
96 |
+
self.emergency_embeddings = np.load(
|
97 |
+
base_path / "embeddings" / "emergency_embeddings.npy"
|
98 |
+
)
|
99 |
+
|
100 |
+
# Load treatment embeddings
|
101 |
+
self.treatment_embeddings = np.load(
|
102 |
+
base_path / "embeddings" / "treatment_embeddings.npy"
|
103 |
+
)
|
104 |
+
|
105 |
+
logger.info("Embeddings loaded successfully")
|
106 |
+
|
107 |
+
except Exception as e:
|
108 |
+
logger.error(f"Failed to load embeddings: {e}")
|
109 |
+
raise
|
110 |
+
|
111 |
+
def _build_or_load_indices(self, base_path: Path) -> None:
|
112 |
+
"""Build or load Annoy indices"""
|
113 |
+
indices_path = base_path / "indices" / "annoy"
|
114 |
+
emergency_index_path = indices_path / "emergency.ann"
|
115 |
+
treatment_index_path = indices_path / "treatment.ann"
|
116 |
+
|
117 |
+
try:
|
118 |
+
# Emergency index
|
119 |
+
if emergency_index_path.exists():
|
120 |
+
self.emergency_index.load(str(emergency_index_path))
|
121 |
+
logger.info("Loaded existing emergency index")
|
122 |
+
else:
|
123 |
+
self._build_index(
|
124 |
+
self.emergency_embeddings,
|
125 |
+
self.emergency_index,
|
126 |
+
emergency_index_path
|
127 |
+
)
|
128 |
+
logger.info("Built new emergency index")
|
129 |
+
|
130 |
+
# Treatment index
|
131 |
+
if treatment_index_path.exists():
|
132 |
+
self.treatment_index.load(str(treatment_index_path))
|
133 |
+
logger.info("Loaded existing treatment index")
|
134 |
+
else:
|
135 |
+
self._build_index(
|
136 |
+
self.treatment_embeddings,
|
137 |
+
self.treatment_index,
|
138 |
+
treatment_index_path
|
139 |
+
)
|
140 |
+
logger.info("Built new treatment index")
|
141 |
+
|
142 |
+
except Exception as e:
|
143 |
+
logger.error(f"Failed to build/load indices: {e}")
|
144 |
+
raise
|
145 |
+
|
146 |
+
def _build_index(self, embeddings: np.ndarray, index: AnnoyIndex,
|
147 |
+
save_path: Path, n_trees: int = 15) -> None:
|
148 |
+
"""
|
149 |
+
Build and save Annoy index
|
150 |
+
|
151 |
+
Args:
|
152 |
+
embeddings: Embedding vectors
|
153 |
+
index: AnnoyIndex instance
|
154 |
+
save_path: Path to save the index
|
155 |
+
n_trees: Number of trees for Annoy index (default: 15)
|
156 |
+
"""
|
157 |
+
try:
|
158 |
+
for i, vec in enumerate(embeddings):
|
159 |
+
index.add_item(i, vec)
|
160 |
+
index.build(n_trees)
|
161 |
+
save_path.parent.mkdir(parents=True, exist_ok=True)
|
162 |
+
index.save(str(save_path))
|
163 |
+
|
164 |
+
except Exception as e:
|
165 |
+
logger.error(f"Failed to build index: {e}")
|
166 |
+
raise
|
167 |
+
|
168 |
+
def search(self, query: str, top_k: int = 5) -> Dict[str, Any]:
|
169 |
+
"""
|
170 |
+
Perform vector search on both indices
|
171 |
+
|
172 |
+
Args:
|
173 |
+
query: Search query
|
174 |
+
top_k: Number of results to return from each index
|
175 |
+
|
176 |
+
Returns:
|
177 |
+
Dict containing search results and metadata
|
178 |
+
"""
|
179 |
+
try:
|
180 |
+
# Get query embedding
|
181 |
+
query_embedding = self.embedding_model.encode([query])[0]
|
182 |
+
|
183 |
+
# Search both indices
|
184 |
+
emergency_results = self._search_index(
|
185 |
+
query_embedding,
|
186 |
+
self.emergency_index,
|
187 |
+
self.emergency_chunks,
|
188 |
+
"emergency",
|
189 |
+
top_k
|
190 |
+
)
|
191 |
+
|
192 |
+
treatment_results = self._search_index(
|
193 |
+
query_embedding,
|
194 |
+
self.treatment_index,
|
195 |
+
self.treatment_chunks,
|
196 |
+
"treatment",
|
197 |
+
top_k
|
198 |
+
)
|
199 |
+
|
200 |
+
# Log individual index results
|
201 |
+
logger.info(f"Search results: Emergency={len(emergency_results)}, Treatment={len(treatment_results)}")
|
202 |
+
|
203 |
+
results = {
|
204 |
+
"query": query,
|
205 |
+
"emergency_results": emergency_results,
|
206 |
+
"treatment_results": treatment_results,
|
207 |
+
"total_results": len(emergency_results) + len(treatment_results)
|
208 |
+
}
|
209 |
+
|
210 |
+
# Post-process results
|
211 |
+
processed_results = self.post_process_results(results)
|
212 |
+
|
213 |
+
return processed_results
|
214 |
+
|
215 |
+
except Exception as e:
|
216 |
+
logger.error(f"Search failed: {e}")
|
217 |
+
raise
|
218 |
+
|
219 |
+
def _search_index(self, query_embedding: np.ndarray, index: AnnoyIndex,
|
220 |
+
chunks: Dict, source_type: str, top_k: int) -> List[Dict]:
|
221 |
+
"""
|
222 |
+
Search a single index and format results
|
223 |
+
|
224 |
+
Args:
|
225 |
+
query_embedding: Query vector
|
226 |
+
index: AnnoyIndex to search
|
227 |
+
chunks: Chunk data
|
228 |
+
source_type: Type of source ("emergency" or "treatment")
|
229 |
+
top_k: Number of results to return
|
230 |
+
|
231 |
+
Returns:
|
232 |
+
List of formatted results
|
233 |
+
"""
|
234 |
+
# Get nearest neighbors
|
235 |
+
indices, distances = index.get_nns_by_vector(
|
236 |
+
query_embedding, top_k, include_distances=True
|
237 |
+
)
|
238 |
+
|
239 |
+
# Format results
|
240 |
+
results = []
|
241 |
+
for idx, distance in zip(indices, distances):
|
242 |
+
chunk_data = chunks[idx] # chunks is a list, use integer index directly
|
243 |
+
result = {
|
244 |
+
"type": source_type, # Using 'type' to match metadata
|
245 |
+
"chunk_id": idx,
|
246 |
+
"distance": distance,
|
247 |
+
"text": chunk_data.get("text", ""),
|
248 |
+
"matched": chunk_data.get("matched", ""),
|
249 |
+
"matched_treatment": chunk_data.get("matched_treatment", "")
|
250 |
+
}
|
251 |
+
results.append(result)
|
252 |
+
|
253 |
+
return results
|
254 |
+
|
255 |
+
def post_process_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
|
256 |
+
"""
|
257 |
+
Post-process search results
|
258 |
+
- Remove duplicates
|
259 |
+
- Sort by distance
|
260 |
+
- Add metadata enrichment
|
261 |
+
|
262 |
+
Args:
|
263 |
+
results: Raw search results
|
264 |
+
|
265 |
+
Returns:
|
266 |
+
Processed results
|
267 |
+
"""
|
268 |
+
try:
|
269 |
+
emergency_results = results["emergency_results"]
|
270 |
+
treatment_results = results["treatment_results"]
|
271 |
+
|
272 |
+
# Combine all results
|
273 |
+
all_results = emergency_results + treatment_results
|
274 |
+
|
275 |
+
# Remove duplicates based on exact text matching
|
276 |
+
unique_results = self._remove_duplicates(all_results)
|
277 |
+
|
278 |
+
# Sort by distance
|
279 |
+
sorted_results = sorted(unique_results, key=lambda x: x["distance"])
|
280 |
+
|
281 |
+
return {
|
282 |
+
"query": results["query"],
|
283 |
+
"processed_results": sorted_results,
|
284 |
+
"total_results": len(sorted_results),
|
285 |
+
"processing_info": {
|
286 |
+
"duplicates_removed": len(all_results) - len(unique_results)
|
287 |
+
}
|
288 |
+
}
|
289 |
+
|
290 |
+
except Exception as e:
|
291 |
+
logger.error(f"Post-processing failed: {e}")
|
292 |
+
raise
|
293 |
+
|
294 |
+
def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
|
295 |
+
"""
|
296 |
+
Remove duplicate results based on exact text matching
|
297 |
+
|
298 |
+
Args:
|
299 |
+
results: List of search results
|
300 |
+
|
301 |
+
Returns:
|
302 |
+
Deduplicated results with logging statistics
|
303 |
+
"""
|
304 |
+
original_count = len(results)
|
305 |
+
seen_texts = set()
|
306 |
+
unique_results = []
|
307 |
+
|
308 |
+
# Sort results by distance (ascending) to keep best matches
|
309 |
+
sorted_results = sorted(results, key=lambda x: x["distance"])
|
310 |
+
|
311 |
+
logger.info(f"Deduplication: Processing {original_count} results using text matching")
|
312 |
+
|
313 |
+
for result in sorted_results:
|
314 |
+
text = result["text"]
|
315 |
+
if text not in seen_texts:
|
316 |
+
seen_texts.add(text)
|
317 |
+
unique_results.append(result)
|
318 |
+
else:
|
319 |
+
logger.debug(f"Skipping duplicate text: {text[:50]}...")
|
320 |
+
|
321 |
+
final_count = len(unique_results)
|
322 |
+
logger.info(f"Deduplication summary: {original_count} → {final_count} results (removed {original_count - final_count})")
|
323 |
+
|
324 |
+
return unique_results
|
325 |
+
|
326 |
+
def search_sliding_window_chunks(self, query: str, top_k: int = 5, window_size: int = 256, overlap: int = 64) -> List[Dict[str, Any]]:
|
327 |
+
"""
|
328 |
+
Perform semantic search using sliding window chunks
|
329 |
+
|
330 |
+
Args:
|
331 |
+
query: Search query
|
332 |
+
top_k: Number of top results to return
|
333 |
+
window_size: Size of sliding window chunks
|
334 |
+
overlap: Overlap between sliding windows
|
335 |
+
|
336 |
+
Returns:
|
337 |
+
List of search results with sliding window chunks
|
338 |
+
"""
|
339 |
+
try:
|
340 |
+
# Get query embedding
|
341 |
+
query_embedding = self.embedding_model.encode([query])[0]
|
342 |
+
|
343 |
+
# Combine emergency and treatment chunks
|
344 |
+
all_chunks = self.emergency_chunks + self.treatment_chunks
|
345 |
+
all_embeddings = np.vstack([self.emergency_embeddings, self.treatment_embeddings])
|
346 |
+
|
347 |
+
# Compute cosine similarities
|
348 |
+
similarities = [
|
349 |
+
np.dot(query_embedding, chunk_emb) /
|
350 |
+
(np.linalg.norm(query_embedding) * np.linalg.norm(chunk_emb))
|
351 |
+
for chunk_emb in all_embeddings
|
352 |
+
]
|
353 |
+
|
354 |
+
# Sort results by similarity
|
355 |
+
sorted_indices = np.argsort(similarities)[::-1]
|
356 |
+
|
357 |
+
# Prepare results
|
358 |
+
results = []
|
359 |
+
for idx in sorted_indices[:top_k]:
|
360 |
+
chunk = all_chunks[idx]
|
361 |
+
result = {
|
362 |
+
'text': chunk.get('text', ''),
|
363 |
+
'distance': similarities[idx],
|
364 |
+
'type': 'emergency' if idx < len(self.emergency_chunks) else 'treatment'
|
365 |
+
}
|
366 |
+
results.append(result)
|
367 |
+
|
368 |
+
logger.info(f"Sliding window search: Found {len(results)} results")
|
369 |
+
return results
|
370 |
+
|
371 |
+
except Exception as e:
|
372 |
+
logger.error(f"Sliding window search failed: {e}")
|
373 |
+
return []
|
374 |
+
|
375 |
+
def search_generic_medical_content(self, query: str, top_k: int = 5) -> List[Dict]:
|
376 |
+
"""
|
377 |
+
Perform generic medical content search
|
378 |
+
|
379 |
+
Args:
|
380 |
+
query: Search query
|
381 |
+
top_k: Number of top results to return
|
382 |
+
|
383 |
+
Returns:
|
384 |
+
List of search results
|
385 |
+
"""
|
386 |
+
try:
|
387 |
+
# re-use search_sliding_window_chunks method
|
388 |
+
return self.search_sliding_window_chunks(query, top_k=top_k)
|
389 |
+
except Exception as e:
|
390 |
+
logger.error(f"Generic medical content search error: {e}")
|
391 |
+
return []
|
src/user_prompt.py
ADDED
@@ -0,0 +1,562 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
OnCall.ai User Prompt Processing Module
|
3 |
+
|
4 |
+
This module handles:
|
5 |
+
1. Condition extraction from user queries
|
6 |
+
2. Keyword mapping
|
7 |
+
3. User confirmation workflow
|
8 |
+
4. Fallback mechanisms
|
9 |
+
|
10 |
+
Author: OnCall.ai Team
|
11 |
+
Date: 2025-07-29
|
12 |
+
"""
|
13 |
+
|
14 |
+
import logging
|
15 |
+
from typing import Dict, Optional, Any, List
|
16 |
+
from sentence_transformers import SentenceTransformer
|
17 |
+
import numpy as np # Added missing import for numpy
|
18 |
+
import os # Added missing import for os
|
19 |
+
import json # Added missing import for json
|
20 |
+
import re # Added missing import for re
|
21 |
+
|
22 |
+
# Import our centralized medical conditions configuration
|
23 |
+
from medical_conditions import (
|
24 |
+
CONDITION_KEYWORD_MAPPING,
|
25 |
+
get_condition_details,
|
26 |
+
validate_condition
|
27 |
+
)
|
28 |
+
|
29 |
+
# Configure logging
|
30 |
+
logging.basicConfig(
|
31 |
+
level=logging.INFO,
|
32 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
33 |
+
)
|
34 |
+
logger = logging.getLogger(__name__)
|
35 |
+
|
36 |
+
class UserPromptProcessor:
|
37 |
+
def __init__(self, llm_client=None, retrieval_system=None):
|
38 |
+
"""
|
39 |
+
Initialize UserPromptProcessor with optional LLM and retrieval system
|
40 |
+
|
41 |
+
Args:
|
42 |
+
llm_client: Optional Llama3-Med42-70B client for advanced condition extraction
|
43 |
+
retrieval_system: Optional retrieval system for semantic search
|
44 |
+
"""
|
45 |
+
self.llm_client = llm_client
|
46 |
+
self.retrieval_system = retrieval_system
|
47 |
+
self.embedding_model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
|
48 |
+
|
49 |
+
# Add embeddings directory path
|
50 |
+
self.embeddings_dir = os.path.join(os.path.dirname(__file__), '..', 'models', 'embeddings')
|
51 |
+
|
52 |
+
logger.info("UserPromptProcessor initialized")
|
53 |
+
|
54 |
+
def extract_condition_keywords(self, user_query: str) -> Dict[str, str]:
|
55 |
+
"""
|
56 |
+
Extract condition keywords with multi-level fallback
|
57 |
+
|
58 |
+
Args:
|
59 |
+
user_query: User's medical query
|
60 |
+
|
61 |
+
Returns:
|
62 |
+
Dict with condition and keywords
|
63 |
+
"""
|
64 |
+
|
65 |
+
# Level 1: Predefined Mapping (Fast Path)
|
66 |
+
predefined_result = self._predefined_mapping(user_query)
|
67 |
+
if predefined_result:
|
68 |
+
return predefined_result
|
69 |
+
|
70 |
+
# Level 2: Llama3-Med42-70B Extraction (if available)
|
71 |
+
if self.llm_client:
|
72 |
+
llm_result = self._extract_with_llm(user_query)
|
73 |
+
if llm_result:
|
74 |
+
return llm_result
|
75 |
+
|
76 |
+
# Level 3: Semantic Search Fallback
|
77 |
+
semantic_result = self._semantic_search_fallback(user_query)
|
78 |
+
if semantic_result:
|
79 |
+
return semantic_result
|
80 |
+
|
81 |
+
# Level 4: Medical Query Validation
|
82 |
+
# Only validate if previous levels failed - speed optimization
|
83 |
+
validation_result = self.validate_medical_query(user_query)
|
84 |
+
if validation_result: # If validation fails (returns non-None)
|
85 |
+
return validation_result
|
86 |
+
|
87 |
+
# Level 5: Generic Medical Search (after validation passes)
|
88 |
+
generic_result = self._generic_medical_search(user_query)
|
89 |
+
if generic_result:
|
90 |
+
return generic_result
|
91 |
+
|
92 |
+
# No match found
|
93 |
+
|
94 |
+
return {
|
95 |
+
'condition': '',
|
96 |
+
'emergency_keywords': '',
|
97 |
+
'treatment_keywords': ''
|
98 |
+
}
|
99 |
+
|
100 |
+
def _predefined_mapping(self, user_query: str) -> Optional[Dict[str, str]]:
|
101 |
+
"""
|
102 |
+
Fast predefined condition mapping
|
103 |
+
|
104 |
+
Args:
|
105 |
+
user_query: User's medical query
|
106 |
+
|
107 |
+
Returns:
|
108 |
+
Mapped condition keywords or None
|
109 |
+
"""
|
110 |
+
query_lower = user_query.lower()
|
111 |
+
|
112 |
+
for condition, mappings in CONDITION_KEYWORD_MAPPING.items():
|
113 |
+
if condition.lower() in query_lower:
|
114 |
+
logger.info(f"Matched predefined condition: {condition}")
|
115 |
+
return {
|
116 |
+
'condition': condition,
|
117 |
+
'emergency_keywords': mappings['emergency'],
|
118 |
+
'treatment_keywords': mappings['treatment']
|
119 |
+
}
|
120 |
+
|
121 |
+
return None
|
122 |
+
|
123 |
+
def _extract_with_llm(self, user_query: str) -> Optional[Dict[str, str]]:
|
124 |
+
"""
|
125 |
+
Use Llama3-Med42-70B for advanced condition extraction
|
126 |
+
|
127 |
+
Args:
|
128 |
+
user_query: User's medical query
|
129 |
+
|
130 |
+
Returns:
|
131 |
+
Dict with condition and keywords, or None
|
132 |
+
"""
|
133 |
+
if not self.llm_client:
|
134 |
+
return None
|
135 |
+
|
136 |
+
try:
|
137 |
+
llama_response = self.llm_client.analyze_medical_query(
|
138 |
+
query=user_query,
|
139 |
+
max_tokens=100,
|
140 |
+
timeout=2.0
|
141 |
+
)
|
142 |
+
|
143 |
+
extracted_condition = llama_response.get('extracted_condition', '')
|
144 |
+
|
145 |
+
if extracted_condition and validate_condition(extracted_condition):
|
146 |
+
condition_details = get_condition_details(extracted_condition)
|
147 |
+
if condition_details:
|
148 |
+
return {
|
149 |
+
'condition': extracted_condition,
|
150 |
+
'emergency_keywords': condition_details.get('emergency', ''),
|
151 |
+
'treatment_keywords': condition_details.get('treatment', '')
|
152 |
+
}
|
153 |
+
|
154 |
+
return None
|
155 |
+
|
156 |
+
except Exception as e:
|
157 |
+
logger.error(f"Llama3-Med42-70B condition extraction error: {e}")
|
158 |
+
return None
|
159 |
+
|
160 |
+
def _semantic_search_fallback(self, user_query: str) -> Optional[Dict[str, str]]:
|
161 |
+
"""
|
162 |
+
Perform semantic search for condition extraction using sliding window chunks
|
163 |
+
|
164 |
+
Args:
|
165 |
+
user_query: User's medical query
|
166 |
+
|
167 |
+
Returns:
|
168 |
+
Dict with condition and keywords, or None
|
169 |
+
"""
|
170 |
+
logger.info(f"Starting semantic search fallback for query: '{user_query}'")
|
171 |
+
|
172 |
+
if not self.retrieval_system:
|
173 |
+
logger.warning("No retrieval system available for semantic search")
|
174 |
+
return None
|
175 |
+
|
176 |
+
try:
|
177 |
+
# Perform semantic search on sliding window chunks
|
178 |
+
semantic_results = self.retrieval_system.search_sliding_window_chunks(user_query)
|
179 |
+
|
180 |
+
logger.info(f"Semantic search returned {len(semantic_results)} results")
|
181 |
+
|
182 |
+
if semantic_results:
|
183 |
+
# Extract condition from top semantic result
|
184 |
+
top_result = semantic_results[0]
|
185 |
+
condition = self._infer_condition_from_text(top_result['text'])
|
186 |
+
|
187 |
+
logger.info(f"Inferred condition: {condition}")
|
188 |
+
|
189 |
+
if condition and validate_condition(condition):
|
190 |
+
condition_details = get_condition_details(condition)
|
191 |
+
if condition_details:
|
192 |
+
result = {
|
193 |
+
'condition': condition,
|
194 |
+
'emergency_keywords': condition_details.get('emergency', ''),
|
195 |
+
'treatment_keywords': condition_details.get('treatment', ''),
|
196 |
+
'semantic_confidence': top_result.get('distance', 0)
|
197 |
+
}
|
198 |
+
|
199 |
+
logger.info(f"Semantic search successful. Condition: {condition}, "
|
200 |
+
f"Confidence: {result['semantic_confidence']}")
|
201 |
+
return result
|
202 |
+
else:
|
203 |
+
logger.warning(f"Condition validation failed for: {condition}")
|
204 |
+
|
205 |
+
logger.info("No suitable condition found in semantic search")
|
206 |
+
return None
|
207 |
+
|
208 |
+
except Exception as e:
|
209 |
+
logger.error(f"Semantic search fallback error: {e}", exc_info=True)
|
210 |
+
return None
|
211 |
+
|
212 |
+
def _generic_medical_search(self, user_query: str) -> Optional[Dict[str, str]]:
|
213 |
+
"""
|
214 |
+
Perform generic medical search as final fallback
|
215 |
+
|
216 |
+
Args:
|
217 |
+
user_query: User's medical query
|
218 |
+
|
219 |
+
Returns:
|
220 |
+
Dict with generic medical keywords
|
221 |
+
"""
|
222 |
+
generic_medical_terms = [
|
223 |
+
"medical", "treatment", "management", "protocol",
|
224 |
+
"guidelines", "emergency", "acute", "chronic"
|
225 |
+
]
|
226 |
+
|
227 |
+
generic_query = f"{user_query} medical treatment"
|
228 |
+
|
229 |
+
try:
|
230 |
+
# Perform generic medical search
|
231 |
+
generic_results = self.retrieval_system.search_generic_medical_content(generic_query)
|
232 |
+
|
233 |
+
if generic_results:
|
234 |
+
return {
|
235 |
+
'condition': 'generic medical query',
|
236 |
+
'emergency_keywords': 'medical|emergency',
|
237 |
+
'treatment_keywords': 'treatment|management',
|
238 |
+
'generic_confidence': 0.5
|
239 |
+
}
|
240 |
+
|
241 |
+
return None
|
242 |
+
except Exception as e:
|
243 |
+
logger.error(f"Generic medical search error: {e}")
|
244 |
+
return None
|
245 |
+
|
246 |
+
def _infer_condition_from_text(self, text: str) -> Optional[str]:
|
247 |
+
"""
|
248 |
+
Infer medical condition from text using embedding similarity
|
249 |
+
|
250 |
+
Args:
|
251 |
+
text: Input medical text
|
252 |
+
|
253 |
+
Returns:
|
254 |
+
Inferred condition or None
|
255 |
+
"""
|
256 |
+
# Implement a simple condition inference using embedding similarity
|
257 |
+
# This is a placeholder and would need more sophisticated implementation
|
258 |
+
conditions = list(CONDITION_KEYWORD_MAPPING.keys())
|
259 |
+
text_embedding = self.embedding_model.encode(text)
|
260 |
+
condition_embeddings = [self.embedding_model.encode(condition) for condition in conditions]
|
261 |
+
|
262 |
+
similarities = [
|
263 |
+
np.dot(text_embedding, condition_emb) /
|
264 |
+
(np.linalg.norm(text_embedding) * np.linalg.norm(condition_emb))
|
265 |
+
for condition_emb in condition_embeddings
|
266 |
+
]
|
267 |
+
|
268 |
+
max_similarity_index = np.argmax(similarities)
|
269 |
+
return conditions[max_similarity_index] if similarities[max_similarity_index] > 0.7 else None
|
270 |
+
|
271 |
+
def validate_keywords(self, keywords: Dict[str, str]) -> bool:
|
272 |
+
"""
|
273 |
+
Validate if extracted keywords exist in our medical indices
|
274 |
+
|
275 |
+
Args:
|
276 |
+
keywords: Dict of emergency and treatment keywords
|
277 |
+
|
278 |
+
Returns:
|
279 |
+
Boolean indicating keyword validity
|
280 |
+
"""
|
281 |
+
emergency_kws = keywords.get('emergency_keywords', '').split('|')
|
282 |
+
treatment_kws = keywords.get('treatment_keywords', '').split('|')
|
283 |
+
|
284 |
+
# Basic validation: check if any keyword is non-empty
|
285 |
+
return any(kw.strip() for kw in emergency_kws + treatment_kws)
|
286 |
+
|
287 |
+
def _check_keyword_in_index(self, keyword: str, index_type: str) -> bool:
|
288 |
+
"""
|
289 |
+
Check if a keyword exists in the specified medical index
|
290 |
+
|
291 |
+
Args:
|
292 |
+
keyword: Keyword to check
|
293 |
+
index_type: Type of index ('emergency' or 'treatment')
|
294 |
+
|
295 |
+
Returns:
|
296 |
+
Boolean indicating keyword existence in the index
|
297 |
+
"""
|
298 |
+
# Validate input parameters
|
299 |
+
if not keyword or not index_type:
|
300 |
+
logger.warning(f"Invalid input: keyword='{keyword}', index_type='{index_type}'")
|
301 |
+
return False
|
302 |
+
|
303 |
+
# Supported index types
|
304 |
+
valid_index_types = ['emergency', 'treatment']
|
305 |
+
if index_type not in valid_index_types:
|
306 |
+
logger.error(f"Unsupported index type: {index_type}")
|
307 |
+
return False
|
308 |
+
|
309 |
+
try:
|
310 |
+
# Construct path to chunks file
|
311 |
+
chunks_path = os.path.join(self.embeddings_dir, f"{index_type}_chunks.json")
|
312 |
+
|
313 |
+
# Check file existence
|
314 |
+
if not os.path.exists(chunks_path):
|
315 |
+
logger.error(f"Index file not found: {chunks_path}")
|
316 |
+
return False
|
317 |
+
|
318 |
+
# Load chunks with error handling
|
319 |
+
with open(chunks_path, 'r', encoding='utf-8') as f:
|
320 |
+
chunks = json.load(f)
|
321 |
+
|
322 |
+
# Normalize keyword for flexible matching
|
323 |
+
keyword_lower = keyword.lower().strip()
|
324 |
+
|
325 |
+
# Advanced keyword matching
|
326 |
+
for chunk in chunks:
|
327 |
+
chunk_text = chunk.get('text', '').lower()
|
328 |
+
|
329 |
+
# Exact match
|
330 |
+
if keyword_lower in chunk_text:
|
331 |
+
logger.info(f"Exact match found for '{keyword}' in {index_type} index")
|
332 |
+
return True
|
333 |
+
|
334 |
+
# Partial match with word boundaries
|
335 |
+
if re.search(r'\b' + re.escape(keyword_lower) + r'\b', chunk_text):
|
336 |
+
logger.info(f"Partial match found for '{keyword}' in {index_type} index")
|
337 |
+
return True
|
338 |
+
|
339 |
+
# No match found
|
340 |
+
logger.info(f"No match found for '{keyword}' in {index_type} index")
|
341 |
+
return False
|
342 |
+
|
343 |
+
except json.JSONDecodeError:
|
344 |
+
logger.error(f"Invalid JSON in {chunks_path}")
|
345 |
+
return False
|
346 |
+
except IOError as e:
|
347 |
+
logger.error(f"IO error reading {chunks_path}: {e}")
|
348 |
+
return False
|
349 |
+
except Exception as e:
|
350 |
+
logger.error(f"Unexpected error in _check_keyword_in_index: {e}")
|
351 |
+
return False
|
352 |
+
|
353 |
+
def handle_user_confirmation(self, extracted_info: Dict[str, str]) -> Dict[str, Any]:
|
354 |
+
"""
|
355 |
+
Handle user confirmation for extracted condition and keywords
|
356 |
+
|
357 |
+
Args:
|
358 |
+
extracted_info: Dict with condition and keyword information
|
359 |
+
|
360 |
+
Returns:
|
361 |
+
Dict with confirmation status and options
|
362 |
+
"""
|
363 |
+
# If no condition found, request user to rephrase
|
364 |
+
if not extracted_info.get('condition'):
|
365 |
+
return {
|
366 |
+
'type': 'rephrase_needed',
|
367 |
+
'message': "Could not identify a specific medical condition. Please rephrase your query.",
|
368 |
+
'suggestions': [
|
369 |
+
"Try: 'how to treat chest pain'",
|
370 |
+
"Try: 'acute stroke management'",
|
371 |
+
"Try: 'pulmonary embolism treatment'"
|
372 |
+
]
|
373 |
+
}
|
374 |
+
|
375 |
+
# Prepare confirmation message
|
376 |
+
confirmation_message = f"""
|
377 |
+
I understand you're asking about: "{extracted_info.get('condition', 'Unknown Condition')}"
|
378 |
+
|
379 |
+
Extracted Keywords:
|
380 |
+
- Emergency: {extracted_info.get('emergency_keywords', 'None')}
|
381 |
+
- Treatment: {extracted_info.get('treatment_keywords', 'None')}
|
382 |
+
|
383 |
+
Please confirm:
|
384 |
+
1) Yes, proceed with search
|
385 |
+
2) No, please rephrase my query
|
386 |
+
3) Modify keywords
|
387 |
+
"""
|
388 |
+
|
389 |
+
return {
|
390 |
+
'type': 'confirmation_needed',
|
391 |
+
'message': confirmation_message,
|
392 |
+
'extracted_info': extracted_info
|
393 |
+
}
|
394 |
+
|
395 |
+
def _handle_matching_failure_level1(self, condition: str) -> Optional[Dict[str, Any]]:
|
396 |
+
"""
|
397 |
+
Level 1 Fallback: Loose keyword matching for medical conditions
|
398 |
+
|
399 |
+
Args:
|
400 |
+
condition: The condition to match loosely
|
401 |
+
|
402 |
+
Returns:
|
403 |
+
Dict with matched keywords or None
|
404 |
+
"""
|
405 |
+
# Predefined loose matching keywords for different medical domains
|
406 |
+
loose_medical_keywords = {
|
407 |
+
'emergency': [
|
408 |
+
'urgent', 'critical', 'severe', 'acute',
|
409 |
+
'immediate', 'life-threatening', 'emergency'
|
410 |
+
],
|
411 |
+
'treatment': [
|
412 |
+
'manage', 'cure', 'heal', 'recover',
|
413 |
+
'therapy', 'medication', 'intervention'
|
414 |
+
]
|
415 |
+
}
|
416 |
+
|
417 |
+
# Normalize condition
|
418 |
+
condition_lower = condition.lower().strip()
|
419 |
+
|
420 |
+
# Check emergency keywords
|
421 |
+
emergency_matches = [
|
422 |
+
kw for kw in loose_medical_keywords['emergency']
|
423 |
+
if kw in condition_lower
|
424 |
+
]
|
425 |
+
|
426 |
+
# Check treatment keywords
|
427 |
+
treatment_matches = [
|
428 |
+
kw for kw in loose_medical_keywords['treatment']
|
429 |
+
if kw in condition_lower
|
430 |
+
]
|
431 |
+
|
432 |
+
# If matches found, return result
|
433 |
+
if emergency_matches or treatment_matches:
|
434 |
+
logger.info(f"Loose keyword match for condition: {condition}")
|
435 |
+
return {
|
436 |
+
'type': 'loose_keyword_match',
|
437 |
+
'condition': condition,
|
438 |
+
'emergency_keywords': '|'.join(emergency_matches),
|
439 |
+
'treatment_keywords': '|'.join(treatment_matches),
|
440 |
+
'confidence': 0.5 # Lower confidence due to loose matching
|
441 |
+
}
|
442 |
+
|
443 |
+
# No loose matches found
|
444 |
+
logger.info(f"No loose keyword match for condition: {condition}")
|
445 |
+
return None
|
446 |
+
|
447 |
+
def validate_medical_query(self, user_query: str) -> Dict[str, Any]:
|
448 |
+
"""
|
449 |
+
Validate if the query is a medical-related query using Llama3-Med42-70B multi-layer verification
|
450 |
+
|
451 |
+
Args:
|
452 |
+
user_query: User's input query
|
453 |
+
|
454 |
+
Returns:
|
455 |
+
Dict with validation result or None if medical query
|
456 |
+
"""
|
457 |
+
# Expanded medical keywords covering comprehensive medical terminology
|
458 |
+
predefined_medical_keywords = {
|
459 |
+
# Symptoms and signs
|
460 |
+
'pain', 'symptom', 'ache', 'fever', 'inflammation',
|
461 |
+
'bleeding', 'swelling', 'rash', 'bruise', 'wound',
|
462 |
+
|
463 |
+
# Medical professional terms
|
464 |
+
'disease', 'condition', 'syndrome', 'disorder',
|
465 |
+
'medical', 'health', 'diagnosis', 'treatment',
|
466 |
+
'therapy', 'medication', 'prescription',
|
467 |
+
|
468 |
+
# Body systems and organs
|
469 |
+
'heart', 'lung', 'brain', 'kidney', 'liver',
|
470 |
+
'blood', 'nerve', 'muscle', 'bone', 'joint',
|
471 |
+
|
472 |
+
# Medical actions
|
473 |
+
'examine', 'check', 'test', 'scan', 'surgery',
|
474 |
+
'operation', 'emergency', 'urgent', 'critical',
|
475 |
+
|
476 |
+
# Specific medical fields
|
477 |
+
'cardiology', 'neurology', 'oncology', 'pediatrics',
|
478 |
+
'psychiatry', 'dermatology', 'orthopedics'
|
479 |
+
}
|
480 |
+
|
481 |
+
# Check if query contains predefined medical keywords
|
482 |
+
query_lower = user_query.lower()
|
483 |
+
if any(kw in query_lower for kw in predefined_medical_keywords):
|
484 |
+
return None # Validated by predefined keywords
|
485 |
+
|
486 |
+
try:
|
487 |
+
# Ensure Llama3-Med42-70B client is properly initialized
|
488 |
+
if not hasattr(self, 'llm_client') or self.llm_client is None:
|
489 |
+
self.logger.warning("Llama3-Med42-70B client not initialized")
|
490 |
+
return self._generate_invalid_query_response()
|
491 |
+
|
492 |
+
# Use Llama3-Med42-70B for final medical query determination
|
493 |
+
llama_result = self.llm_client.analyze_medical_query(
|
494 |
+
query=user_query,
|
495 |
+
max_tokens=100 # Limit tokens for efficiency
|
496 |
+
)
|
497 |
+
|
498 |
+
# If Llama3-Med42-70B successfully extracts a medical condition
|
499 |
+
if llama_result.get('extracted_condition'):
|
500 |
+
return None # Validated by Llama3-Med42-70B
|
501 |
+
|
502 |
+
except Exception as e:
|
503 |
+
# Log Llama3-Med42-70B analysis failure without blocking the process
|
504 |
+
self.logger.warning(f"Llama3-Med42-70B query validation failed: {e}")
|
505 |
+
|
506 |
+
# If no medical relevance is found
|
507 |
+
return self._generate_invalid_query_response()
|
508 |
+
|
509 |
+
def _generate_invalid_query_response(self) -> Dict[str, Any]:
|
510 |
+
"""
|
511 |
+
Generate response for non-medical queries
|
512 |
+
|
513 |
+
Returns:
|
514 |
+
Dict with invalid query guidance
|
515 |
+
"""
|
516 |
+
return {
|
517 |
+
'type': 'invalid_query',
|
518 |
+
'message': "This is OnCall.AI, a clinical medical assistance platform. "
|
519 |
+
"Please input a medical problem you need help resolving. "
|
520 |
+
"\n\nExamples:\n"
|
521 |
+
"- 'I'm experiencing chest pain'\n"
|
522 |
+
"- 'What are symptoms of stroke?'\n"
|
523 |
+
"- 'How to manage acute asthma?'\n"
|
524 |
+
"- 'I have a persistent headache'"
|
525 |
+
}
|
526 |
+
|
527 |
+
def main():
    """
    Example usage and testing of UserPromptProcessor with Llama3-Med42-70B
    Demonstrates condition extraction and query validation
    """
    # Relative imports are deferred into the function body to avoid the
    # circular-import issue noted in the original (llm_clients <-> user_prompt).
    from .retrieval import BasicRetrievalSystem
    from .llm_clients import llm_Med42_70BClient

    # Wire up the LLM client, the retrieval backend, and the prompt processor.
    med42_client = llm_Med42_70BClient()
    retriever = BasicRetrievalSystem()
    processor = UserPromptProcessor(
        llm_client=med42_client, retrieval_system=retriever
    )

    # Representative medical queries used as a quick smoke test.
    sample_queries = (
        "patient with severe chest pain and shortness of breath",
        "sudden neurological symptoms suggesting stroke",
        "persistent headache with vision changes",
    )

    for query in sample_queries:
        print(f"\nQuery: {query}")

        extraction = processor.extract_condition_keywords(query)
        print("Extracted Keywords:", extraction)

        confirmation = processor.handle_user_confirmation(extraction)
        print("Confirmation:", confirmation['message'])

if __name__ == "__main__":
    main()
|
test_retrieval_pipeline.py
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for OnCall.ai retrieval pipeline
|
4 |
+
|
5 |
+
This script tests the complete flow:
|
6 |
+
user_input → user_prompt.py → retrieval.py
|
7 |
+
|
8 |
+
Author: OnCall.ai Team
|
9 |
+
Date: 2025-07-30
|
10 |
+
"""
|
11 |
+
|
12 |
+
import sys
import os
from pathlib import Path  # NOTE(review): imported but unused in this script — confirm before removing
import logging
import json
from datetime import datetime

# Add src directory to Python path
# (allows importing the project modules below without installing the package)
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

# Import our modules
from user_prompt import UserPromptProcessor
from retrieval import BasicRetrievalSystem
from llm_clients import llm_Med42_70BClient

# Configure logging
# Logs go to both the console and a persistent file so test runs can be
# reviewed after the fact.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('test_retrieval_pipeline.log')
    ]
)
logger = logging.getLogger(__name__)
|
37 |
+
|
38 |
+
def test_retrieval_pipeline():
    """
    Test the complete retrieval pipeline.

    Runs user_input -> user_prompt.py -> retrieval.py end to end:
    for each hard-coded query it extracts condition keywords, simulates
    user confirmation, performs retrieval, prints the top matches, and
    collects a per-query result dict. A summary is printed and the results
    are saved to a timestamped JSON file.

    Returns:
        List of per-query result dicts ('success' True/False plus details),
        or an empty list if component initialization fails.
    """
    print("="*60)
    print("OnCall.ai Retrieval Pipeline Test")
    print("="*60)
    print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    try:
        # Initialize components
        print("🔧 Initializing components...")

        # Initialize LLM client
        llm_client = llm_Med42_70BClient()
        print("✅ LLM client initialized")

        # Initialize retrieval system
        retrieval_system = BasicRetrievalSystem()
        print("✅ Retrieval system initialized")

        # Initialize user prompt processor
        user_prompt_processor = UserPromptProcessor(
            llm_client=llm_client,
            retrieval_system=retrieval_system
        )
        print("✅ User prompt processor initialized")
        print()

        # Test queries
        test_queries = [
            "how to treat acute MI?",
            "patient with chest pain and shortness of breath",
            "sudden neurological symptoms suggesting stroke",
            "acute stroke management protocol"
        ]

        results = []

        for i, query in enumerate(test_queries, 1):
            print(f"🔍 Test {i}/{len(test_queries)}: Testing query: '{query}'")
            print("-" * 50)

            try:
                # Step 1: Extract condition keywords
                print("Step 1: Extracting condition keywords...")
                condition_result = user_prompt_processor.extract_condition_keywords(query)

                print(f"   Condition: {condition_result.get('condition', 'None')}")
                print(f"   Emergency keywords: {condition_result.get('emergency_keywords', 'None')}")
                print(f"   Treatment keywords: {condition_result.get('treatment_keywords', 'None')}")

                # Without an extracted condition there is nothing to search for.
                if not condition_result.get('condition'):
                    print("   ⚠️ No condition extracted, skipping retrieval")
                    continue

                # Step 2: User confirmation (simulated)
                print("\nStep 2: User confirmation (simulated as 'yes')")
                confirmation = user_prompt_processor.handle_user_confirmation(condition_result)
                print(f"   Confirmation type: {confirmation.get('type', 'Unknown')}")

                # Step 3: Perform retrieval
                # The search query is built from the keyword fields; the raw
                # condition (or original query) is the fallback when both are empty.
                print("\nStep 3: Performing retrieval...")
                search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()

                if not search_query:
                    search_query = condition_result.get('condition', query)

                print(f"   Search query: '{search_query}'")

                retrieval_results = retrieval_system.search(search_query, top_k=5)

                # Display results
                print(f"\n📊 Retrieval Results:")
                print(f"   Total results: {retrieval_results.get('total_results', 0)}")

                emergency_results = retrieval_results.get('emergency_results', [])
                treatment_results = retrieval_results.get('treatment_results', [])

                print(f"   Emergency results: {len(emergency_results)}")
                print(f"   Treatment results: {len(treatment_results)}")

                # Show top results
                if 'processed_results' in retrieval_results:
                    processed_results = retrieval_results['processed_results'][:3]  # Show top 3
                    print(f"\n   Top {len(processed_results)} results:")
                    for j, result in enumerate(processed_results, 1):
                        print(f"   {j}. Type: {result.get('type', 'Unknown')}")
                        # NOTE(review): the ':.4f' format raises TypeError if
                        # 'distance' is missing (default string 'Unknown') —
                        # confirm retrieval always supplies a float here.
                        print(f"      Distance: {result.get('distance', 'Unknown'):.4f}")
                        print(f"      Text preview: {result.get('text', '')[:100]}...")
                        print(f"      Matched: {result.get('matched', 'None')}")
                        print(f"      Treatment matched: {result.get('matched_treatment', 'None')}")
                        print()

                # Store results for summary
                test_result = {
                    'query': query,
                    'condition_extracted': condition_result.get('condition', ''),
                    'emergency_keywords': condition_result.get('emergency_keywords', ''),
                    'treatment_keywords': condition_result.get('treatment_keywords', ''),
                    'search_query': search_query,
                    'total_results': retrieval_results.get('total_results', 0),
                    'emergency_count': len(emergency_results),
                    'treatment_count': len(treatment_results),
                    'success': True
                }
                results.append(test_result)

                print("✅ Test completed successfully")

            except Exception as e:
                # A single failed query is recorded but does not stop the suite.
                logger.error(f"Error in test {i}: {e}", exc_info=True)
                test_result = {
                    'query': query,
                    'error': str(e),
                    'success': False
                }
                results.append(test_result)
                print(f"❌ Test failed: {e}")

            print("\n" + "="*60 + "\n")

        # Print summary
        print_test_summary(results)

        # Save results to file
        save_test_results(results)

        return results

    except Exception as e:
        # Initialization (or other pre-loop) failure aborts the whole run.
        logger.error(f"Critical error in pipeline test: {e}", exc_info=True)
        print(f"❌ Critical error: {e}")
        return []
|
173 |
+
|
174 |
+
def print_test_summary(results):
    """
    Print a human-readable summary of pipeline test results.

    Args:
        results: list of per-test dicts; each carries a 'success' bool plus
            either extraction/retrieval counts (on success) or an 'error'
            string (on failure). May be empty.
    """
    print("📋 TEST SUMMARY")
    print("="*60)

    successful_tests = [r for r in results if r.get('success', False)]
    failed_tests = [r for r in results if not r.get('success', False)]

    print(f"Total tests: {len(results)}")
    print(f"Successful: {len(successful_tests)}")
    print(f"Failed: {len(failed_tests)}")
    # Bug fix: the original divided by len(results) unconditionally and raised
    # ZeroDivisionError when the pipeline aborted before running any test.
    success_rate = (len(successful_tests) / len(results) * 100) if results else 0.0
    print(f"Success rate: {success_rate:.1f}%")
    print()

    if successful_tests:
        print("✅ Successful tests:")
        for result in successful_tests:
            print(f"   - '{result['query']}'")
            print(f"     Condition: {result.get('condition_extracted', 'None')}")
            print(f"     Results: {result.get('total_results', 0)} total "
                  f"({result.get('emergency_count', 0)} emergency, "
                  f"{result.get('treatment_count', 0)} treatment)")
        print()

    if failed_tests:
        print("❌ Failed tests:")
        for result in failed_tests:
            print(f"   - '{result['query']}': {result.get('error', 'Unknown error')}")
        print()
|
203 |
+
|
204 |
+
def save_test_results(results):
    """
    Persist test results to a timestamped JSON file in the working directory.

    Args:
        results: list of per-test result dicts (JSON-serializable).

    The file is named 'test_results_<YYYYmmdd_HHMMSS>.json'. Failures to
    write are logged and reported but never raised to the caller.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"test_results_{timestamp}.json"

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump({
                'timestamp': datetime.now().isoformat(),
                'test_results': results
            }, f, indent=2, ensure_ascii=False)

        # Bug fix: the success message previously printed a hard-coded
        # placeholder instead of the file actually written; 'filename' was
        # computed but never used in the message.
        print(f"📁 Test results saved to: {filename}")

    except Exception as e:
        logger.error(f"Failed to save test results: {e}")
        print(f"⚠️ Failed to save test results: {e}")

if __name__ == "__main__":
    test_retrieval_pipeline()
|
tests/requirements.txt
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==24.1.0
|
2 |
+
aiohappyeyeballs==2.6.1
|
3 |
+
aiohttp==3.12.14
|
4 |
+
aiosignal==1.4.0
|
5 |
+
annotated-types==0.7.0
|
6 |
+
annoy==1.17.3
|
7 |
+
anyio==4.9.0
|
8 |
+
attrs==25.3.0
|
9 |
+
Brotli==1.1.0
|
10 |
+
certifi==2025.7.14
|
11 |
+
charset-normalizer==3.4.2
|
12 |
+
click==8.2.1
|
13 |
+
contourpy==1.3.2
|
14 |
+
cycler==0.12.1
|
15 |
+
datasets==4.0.0
|
16 |
+
dill==0.3.8
|
17 |
+
distro==1.9.0
|
18 |
+
fastapi==0.116.1
|
19 |
+
ffmpy==0.6.0
|
20 |
+
filelock==3.18.0
|
21 |
+
fonttools==4.59.0
|
22 |
+
frozenlist==1.7.0
|
23 |
+
fsspec==2025.3.0
|
24 |
+
gradio==5.38.0
|
25 |
+
gradio_client==1.11.0
|
26 |
+
groovy==0.1.2
|
27 |
+
h11==0.16.0
|
28 |
+
hf-xet==1.1.5
|
29 |
+
httpcore==1.0.9
|
30 |
+
httpx==0.28.1
|
31 |
+
huggingface-hub==0.33.4
|
32 |
+
idna==3.10
|
33 |
+
iniconfig==2.1.0
|
34 |
+
Jinja2==3.1.6
|
35 |
+
jiter==0.10.0
|
36 |
+
joblib==1.5.1
|
37 |
+
kiwisolver==1.4.8
|
38 |
+
markdown-it-py==3.0.0
|
39 |
+
MarkupSafe==3.0.2
|
40 |
+
matplotlib==3.10.3
|
41 |
+
mdurl==0.1.2
|
42 |
+
mpmath==1.3.0
|
43 |
+
multidict==6.6.3
|
44 |
+
multiprocess==0.70.16
|
45 |
+
networkx==3.5
|
46 |
+
numpy==2.3.1
|
47 |
+
openai==1.97.0
|
48 |
+
orjson==3.11.0
|
49 |
+
packaging==25.0
|
50 |
+
pandas==2.3.1
|
51 |
+
pillow==11.3.0
|
52 |
+
pluggy==1.6.0
|
53 |
+
propcache==0.3.2
|
54 |
+
pyarrow==20.0.0
|
55 |
+
pydantic==2.11.7
|
56 |
+
pydantic_core==2.33.2
|
57 |
+
pydub==0.25.1
|
58 |
+
Pygments==2.19.2
|
59 |
+
pyparsing==3.2.3
|
60 |
+
pytest==8.4.1
|
61 |
+
python-dateutil==2.9.0.post0
|
62 |
+
python-multipart==0.0.20
|
63 |
+
pytz==2025.2
|
64 |
+
PyYAML==6.0.2
|
65 |
+
regex==2024.11.6
|
66 |
+
requests==2.32.4
|
67 |
+
rich==14.0.0
|
68 |
+
ruff==0.12.4
|
69 |
+
safehttpx==0.1.6
|
70 |
+
safetensors==0.5.3
|
71 |
+
scikit-learn==1.7.1
|
72 |
+
scipy==1.16.1
|
73 |
+
seaborn==0.13.2
|
74 |
+
semantic-version==2.10.0
|
75 |
+
sentence-transformers==3.0.1
|
76 |
+
shellingham==1.5.4
|
77 |
+
six==1.17.0
|
78 |
+
sniffio==1.3.1
|
79 |
+
starlette==0.47.2
|
80 |
+
sympy==1.14.0
|
81 |
+
threadpoolctl==3.6.0
|
82 |
+
tokenizers==0.21.2
|
83 |
+
tomlkit==0.13.3
|
84 |
+
torch==2.7.1
|
85 |
+
tqdm==4.67.1
|
86 |
+
transformers==4.53.2
|
87 |
+
typer==0.16.0
|
88 |
+
typing-inspection==0.4.1
|
89 |
+
typing_extensions==4.14.1
|
90 |
+
tzdata==2025.2
|
91 |
+
urllib3==2.5.0
|
92 |
+
uvicorn==0.35.0
|
93 |
+
websockets==15.0.1
|
94 |
+
xxhash==3.5.0
|
95 |
+
yarl==1.20.1
|
tests/result_of_test_end_to_end_pipeline.md
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tests/result_of_test_multlevel_fallback_validation.md
ADDED
@@ -0,0 +1,570 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
🏥 OnCall.ai Multilevel Fallback Validation Test
|
2 |
+
============================================================
|
3 |
+
🔧 Initializing Components for Multilevel Fallback Test...
|
4 |
+
------------------------------------------------------------
|
5 |
+
1. Initializing Llama3-Med42-70B Client...
|
6 |
+
2025-07-31 07:12:17,625 - llm_clients - INFO - Medical LLM client initialized with model: m42-health/Llama3-Med42-70B
|
7 |
+
2025-07-31 07:12:17,626 - llm_clients - WARNING - Medical LLM Model: Research tool only. Not for professional medical diagnosis.
|
8 |
+
✅ LLM client initialized
|
9 |
+
2. Initializing Retrieval System...
|
10 |
+
2025-07-31 07:12:17,626 - retrieval - INFO - Initializing retrieval system...
|
11 |
+
2025-07-31 07:12:17,637 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
12 |
+
2025-07-31 07:12:17,637 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
|
13 |
+
2025-07-31 07:12:20,936 - retrieval - INFO - Embedding model loaded successfully
|
14 |
+
2025-07-31 07:12:22,314 - retrieval - INFO - Chunks loaded successfully
|
15 |
+
2025-07-31 07:12:22,418 - retrieval - INFO - Embeddings loaded successfully
|
16 |
+
2025-07-31 07:12:22,419 - retrieval - INFO - Loaded existing emergency index
|
17 |
+
2025-07-31 07:12:22,420 - retrieval - INFO - Loaded existing treatment index
|
18 |
+
2025-07-31 07:12:22,420 - retrieval - INFO - Retrieval system initialized successfully
|
19 |
+
✅ Retrieval system initialized
|
20 |
+
3. Initializing User Prompt Processor...
|
21 |
+
2025-07-31 07:12:22,420 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
22 |
+
2025-07-31 07:12:22,420 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
|
23 |
+
2025-07-31 07:12:24,622 - user_prompt - INFO - UserPromptProcessor initialized
|
24 |
+
✅ User prompt processor initialized
|
25 |
+
|
26 |
+
🎉 All components initialized successfully!
|
27 |
+
|
28 |
+
🚀 Starting Multilevel Fallback Test Suite
|
29 |
+
Total test cases: 13
|
30 |
+
Test started at: 2025-07-31 07:12:17
|
31 |
+
================================================================================
|
32 |
+
|
33 |
+
🔍 level1_001: Level 1: Direct predefined condition match
|
34 |
+
Query: 'acute myocardial infarction treatment'
|
35 |
+
Expected Level: 1
|
36 |
+
----------------------------------------------------------------------
|
37 |
+
🎯 Executing multilevel fallback...
|
38 |
+
2025-07-31 07:12:24,623 - user_prompt - INFO - Matched predefined condition: acute myocardial infarction
|
39 |
+
✅ Detected Level: 1
|
40 |
+
Condition: acute myocardial infarction
|
41 |
+
Emergency Keywords: MI|chest pain|cardiac arrest
|
42 |
+
Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
|
43 |
+
Execution Time: 0.000s
|
44 |
+
🎉 Test PASSED - Expected behavior achieved
|
45 |
+
|
46 |
+
🔍 level1_002: Level 1: Predefined stroke condition
|
47 |
+
Query: 'how to manage acute stroke?'
|
48 |
+
Expected Level: 1
|
49 |
+
----------------------------------------------------------------------
|
50 |
+
🎯 Executing multilevel fallback...
|
51 |
+
2025-07-31 07:12:24,623 - user_prompt - INFO - Matched predefined condition: acute stroke
|
52 |
+
✅ Detected Level: 1
|
53 |
+
Condition: acute stroke
|
54 |
+
Emergency Keywords: stroke|neurological deficit|sudden weakness
|
55 |
+
Treatment Keywords: tPA|thrombolysis|stroke unit care
|
56 |
+
Execution Time: 0.000s
|
57 |
+
🎉 Test PASSED - Expected behavior achieved
|
58 |
+
|
59 |
+
🔍 level1_003: Level 1: Predefined PE condition
|
60 |
+
Query: 'pulmonary embolism emergency protocol'
|
61 |
+
Expected Level: 1
|
62 |
+
----------------------------------------------------------------------
|
63 |
+
🎯 Executing multilevel fallback...
|
64 |
+
2025-07-31 07:12:24,623 - user_prompt - INFO - Matched predefined condition: pulmonary embolism
|
65 |
+
✅ Detected Level: 1
|
66 |
+
Condition: pulmonary embolism
|
67 |
+
Emergency Keywords: chest pain|shortness of breath|sudden dyspnea
|
68 |
+
Treatment Keywords: anticoagulation|heparin|embolectomy
|
69 |
+
Execution Time: 0.000s
|
70 |
+
🎉 Test PASSED - Expected behavior achieved
|
71 |
+
|
72 |
+
🔍 level2_001: Level 2: Symptom-based query requiring LLM analysis
|
73 |
+
Query: 'patient with severe crushing chest pain radiating to left arm'
|
74 |
+
Expected Level: 2
|
75 |
+
----------------------------------------------------------------------
|
76 |
+
🎯 Executing multilevel fallback...
|
77 |
+
2025-07-31 07:12:24,623 - llm_clients - INFO - Calling Medical LLM with query: patient with severe crushing chest pain radiating to left arm
|
78 |
+
2025-07-31 07:12:47,629 - llm_clients - INFO - Raw LLM Response: Acute Myocardial Infarction (STEMI) - considering "severe crushing chest pain" and radiation to the left arm, which are classic symptoms of a heart attack specifically involving ST-elevation (STEMI type), indicating complete blockage of a coronary artery. However, please note that as an AI assistant, I don't diagnose; this interpretation is based on common clinical presentation. A healthcare provider should perform an ECG and other tests for confirmation.
|
79 |
+
2025-07-31 07:12:47,630 - llm_clients - INFO - Query Latency: 23.0064 seconds
|
80 |
+
2025-07-31 07:12:47,630 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
|
81 |
+
✅ Detected Level: 1
|
82 |
+
Condition: acute myocardial infarction
|
83 |
+
Emergency Keywords: MI|chest pain|cardiac arrest
|
84 |
+
Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
|
85 |
+
Execution Time: 23.008s
|
86 |
+
🎉 Test PASSED - Expected behavior achieved
|
87 |
+
|
88 |
+
🔍 level2_002: Level 2: Neurological symptoms requiring LLM
|
89 |
+
Query: 'sudden onset weakness on right side with speech difficulty'
|
90 |
+
Expected Level: 2
|
91 |
+
----------------------------------------------------------------------
|
92 |
+
🎯 Executing multilevel fallback...
|
93 |
+
2025-07-31 07:12:47,631 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
|
94 |
+
2025-07-31 07:12:56,760 - llm_clients - INFO - Raw LLM Response: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
|
95 |
+
|
96 |
+
- This diagnosis represents the most likely condition given the sudden onset of right-sided weakness (hemiparesis) and speech difficulty (aphasia). An ischemic stroke occurs when blood flow to a part of the brain is blocked, typically by a thrombus or embolus, causing damage to brain tissue and resulting in neurological deficits. Immediate medical
|
97 |
+
2025-07-31 07:12:56,760 - llm_clients - INFO - Query Latency: 9.1288 seconds
|
98 |
+
2025-07-31 07:12:56,760 - llm_clients - INFO - Extracted Condition: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
|
99 |
+
2025-07-31 07:12:56,760 - user_prompt - INFO - Starting semantic search fallback for query: 'sudden onset weakness on right side with speech difficulty'
|
100 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.66it/s]
|
101 |
+
2025-07-31 07:12:58,013 - retrieval - INFO - Sliding window search: Found 5 results
|
102 |
+
2025-07-31 07:12:58,023 - user_prompt - INFO - Semantic search returned 5 results
|
103 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.88it/s]
|
104 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.77it/s]
|
105 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.88it/s]
|
106 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.68it/s]
|
107 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.51it/s]
|
108 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.08it/s]
|
109 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.75it/s]
|
110 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.98it/s]
|
111 |
+
2025-07-31 07:12:58,342 - user_prompt - INFO - Inferred condition: None
|
112 |
+
2025-07-31 07:12:58,342 - user_prompt - WARNING - Condition validation failed for: None
|
113 |
+
2025-07-31 07:12:58,342 - user_prompt - INFO - No suitable condition found in semantic search
|
114 |
+
2025-07-31 07:12:58,342 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
|
115 |
+
2025-07-31 07:13:09,255 - llm_clients - INFO - Raw LLM Response: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
|
116 |
+
|
117 |
+
- This diagnosis represents the most likely condition given the sudden onset of right-sided weakness (hemiparesis) and speech difficulty (aphasia), which are classic symptoms of an ischemic stroke affecting the dominant hemisphere (assuming the patient is right-handed).
|
118 |
+
|
119 |
+
Please note that only a qualified physician can confirm a diagnosis after a thorough evaluation, including imaging studies
|
120 |
+
2025-07-31 07:13:09,255 - llm_clients - INFO - Query Latency: 10.9129 seconds
|
121 |
+
2025-07-31 07:13:09,255 - llm_clients - INFO - Extracted Condition: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
|
122 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 8.55it/s]
|
123 |
+
2025-07-31 07:13:09,844 - retrieval - INFO - Sliding window search: Found 5 results
|
124 |
+
✅ Detected Level: 5
|
125 |
+
Condition: generic medical query
|
126 |
+
Emergency Keywords: medical|emergency
|
127 |
+
Treatment Keywords: treatment|management
|
128 |
+
Execution Time: 22.223s
|
129 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
|
130 |
+
|
131 |
+
🔍 level3_001: Level 3: Generic medical terms requiring semantic search
|
132 |
+
Query: 'emergency management of cardiovascular crisis'
|
133 |
+
Expected Level: 3
|
134 |
+
----------------------------------------------------------------------
|
135 |
+
🎯 Executing multilevel fallback...
|
136 |
+
2025-07-31 07:13:09,854 - llm_clients - INFO - Calling Medical LLM with query: emergency management of cardiovascular crisis
|
137 |
+
2025-07-31 07:13:20,094 - llm_clients - INFO - Raw LLM Response: Cardiac Arrest (or, in context of crisis not yet arrest: Acute Cardiogenic Emergency, e.g., STEMI)
|
138 |
+
|
139 |
+
- Note: As a text-based AI assistant, not a clinician, I don't provide medical advice. The term given here represents the most critical cardiovascular crisis requiring immediate emergency intervention. Cardiac arrest implies the heart has stopped pumping, while acute cardiogenic emergency (e.g., ST-elevation myocardial infarction, or STEMI) signifies severe heart
|
140 |
+
2025-07-31 07:13:20,095 - llm_clients - INFO - Query Latency: 10.2402 seconds
|
141 |
+
2025-07-31 07:13:20,095 - llm_clients - INFO - Extracted Condition: Cardiac Arrest (or, in context of crisis not yet arrest: Acute Cardiogenic Emergency, e.g., STEMI)
|
142 |
+
2025-07-31 07:13:20,095 - user_prompt - INFO - Starting semantic search fallback for query: 'emergency management of cardiovascular crisis'
|
143 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.11it/s]
|
144 |
+
2025-07-31 07:13:20,681 - retrieval - INFO - Sliding window search: Found 5 results
|
145 |
+
2025-07-31 07:13:20,713 - user_prompt - INFO - Semantic search returned 5 results
|
146 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.75it/s]
|
147 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.28it/s]
|
148 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.29it/s]
|
149 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.79it/s]
|
150 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.12it/s]
|
151 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.44it/s]
|
152 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.88it/s]
|
153 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.20it/s]
|
154 |
+
2025-07-31 07:13:20,905 - user_prompt - INFO - Inferred condition: None
|
155 |
+
2025-07-31 07:13:20,905 - user_prompt - WARNING - Condition validation failed for: None
|
156 |
+
2025-07-31 07:13:20,905 - user_prompt - INFO - No suitable condition found in semantic search
|
157 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.96it/s]
|
158 |
+
2025-07-31 07:13:21,492 - retrieval - INFO - Sliding window search: Found 5 results
|
159 |
+
✅ Detected Level: 5
|
160 |
+
Condition: generic medical query
|
161 |
+
Emergency Keywords: medical|emergency
|
162 |
+
Treatment Keywords: treatment|management
|
163 |
+
Execution Time: 11.647s
|
164 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
|
165 |
+
|
166 |
+
🔍 level3_002: Level 3: Medical terminology requiring semantic fallback
|
167 |
+
Query: 'urgent neurological intervention protocols'
|
168 |
+
Expected Level: 3
|
169 |
+
----------------------------------------------------------------------
|
170 |
+
🎯 Executing multilevel fallback...
|
171 |
+
2025-07-31 07:13:21,501 - llm_clients - INFO - Calling Medical LLM with query: urgent neurological intervention protocols
|
172 |
+
2025-07-31 07:13:30,536 - llm_clients - INFO - Raw LLM Response: The most representative condition: Acute Ischemic Stroke (requiring urgent neurointervention, such as thrombectomy)
|
173 |
+
|
174 |
+
Explanation: The phrase "urgent neurological intervention protocols" typically refers to time-critical situations in neurology, and among these, acute ischemic stroke is a prime example. Acute ischemic stroke necessitates rapid evaluation and intervention, including thrombectomy, to restore blood flow and minimize brain damage. This condition demands urgent action due to its narrow therapeutic window, typically within
|
175 |
+
2025-07-31 07:13:30,537 - llm_clients - INFO - Query Latency: 9.0352 seconds
|
176 |
+
2025-07-31 07:13:30,537 - llm_clients - INFO - Extracted Condition: The most representative condition: Acute Ischemic Stroke (requiring urgent neurointervention, such as thrombectomy)
|
177 |
+
2025-07-31 07:13:30,537 - user_prompt - INFO - Starting semantic search fallback for query: 'urgent neurological intervention protocols'
|
178 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 7.94it/s]
|
179 |
+
2025-07-31 07:13:31,115 - retrieval - INFO - Sliding window search: Found 5 results
|
180 |
+
2025-07-31 07:13:31,123 - user_prompt - INFO - Semantic search returned 5 results
|
181 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.96it/s]
|
182 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.55it/s]
|
183 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.09it/s]
|
184 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.23it/s]
|
185 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.16it/s]
|
186 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.05it/s]
|
187 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.42it/s]
|
188 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.08it/s]
|
189 |
+
2025-07-31 07:13:31,334 - user_prompt - INFO - Inferred condition: None
|
190 |
+
2025-07-31 07:13:31,334 - user_prompt - WARNING - Condition validation failed for: None
|
191 |
+
2025-07-31 07:13:31,334 - user_prompt - INFO - No suitable condition found in semantic search
|
192 |
+
Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.31it/s]
|
193 |
+
2025-07-31 07:13:31,889 - retrieval - INFO - Sliding window search: Found 5 results
|
194 |
+
✅ Detected Level: 5
|
195 |
+
Condition: generic medical query
|
196 |
+
Emergency Keywords: medical|emergency
|
197 |
+
Treatment Keywords: treatment|management
|
198 |
+
Execution Time: 10.398s
|
199 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
|
200 |
+
|
201 |
+
🔍 level4a_001: Level 4a: Non-medical query should be rejected
|
202 |
+
Query: 'how to cook pasta properly?'
|
203 |
+
Expected Level: 4
|
204 |
+
----------------------------------------------------------------------
|
205 |
+
🎯 Executing multilevel fallback...
|
206 |
+
2025-07-31 07:13:31,899 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
|
207 |
+
2025-07-31 07:13:41,038 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address cooking techniques, only medical conditions. However, for context (not advice): This query doesn't represent a medical condition; it's about culinary practice. In this case, "properly" cooking pasta typically means achieving al dente texture (not overly soft) by boiling in adequately salted water for the recommended time on the package, then draining well. This is unrelated to any health condition unless discussing, hypothetically, gastrointestinal tolerance in specific patients (e
|
208 |
+
2025-07-31 07:13:41,038 - llm_clients - INFO - Query Latency: 9.1386 seconds
|
209 |
+
2025-07-31 07:13:41,038 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address cooking techniques, only medical conditions. However, for context (not advice): This query doesn't represent a medical condition; it's about culinary practice. In this case, "properly" cooking pasta typically means achieving al dente texture (not overly soft) by boiling in adequately salted water for the recommended time on the package, then draining well. This is unrelated to any health condition unless discussing, hypothetically, gastrointestinal tolerance in specific patients (e
|
210 |
+
2025-07-31 07:13:41,038 - user_prompt - INFO - Starting semantic search fallback for query: 'how to cook pasta properly?'
|
211 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.02it/s]
|
212 |
+
2025-07-31 07:13:42,156 - retrieval - INFO - Sliding window search: Found 5 results
|
213 |
+
2025-07-31 07:13:42,165 - user_prompt - INFO - Semantic search returned 5 results
|
214 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 9.34it/s]
|
215 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 52.88it/s]
|
216 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.97it/s]
|
217 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.95it/s]
|
218 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.63it/s]
|
219 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.07it/s]
|
220 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.84it/s]
|
221 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.43it/s]
|
222 |
+
2025-07-31 07:13:42,407 - user_prompt - INFO - Inferred condition: None
|
223 |
+
2025-07-31 07:13:42,407 - user_prompt - WARNING - Condition validation failed for: None
|
224 |
+
2025-07-31 07:13:42,407 - user_prompt - INFO - No suitable condition found in semantic search
|
225 |
+
2025-07-31 07:13:42,407 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
|
226 |
+
2025-07-31 07:13:51,634 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I don't address cooking techniques, but for context (not medical advice): In terms of relevance to health, the key aspect here isn't "proper" cooking per se, but rather avoiding overcooking that can reduce nutrient content. For whole-grain pasta, aim for al dente texture (firm, not mushy) to preserve fiber and other nutrients. However, this query is not about a medical condition but a culinary practice.
|
227 |
+
Representative (non-medical) term
|
228 |
+
2025-07-31 07:13:51,634 - llm_clients - INFO - Query Latency: 9.2269 seconds
|
229 |
+
2025-07-31 07:13:51,634 - llm_clients - INFO - Extracted Condition: As a medical assistant, I don't address cooking techniques, but for context (not medical advice): In terms of relevance to health, the key aspect here isn't "proper" cooking per se, but rather avoiding overcooking that can reduce nutrient content. For whole-grain pasta, aim for al dente texture (firm, not mushy) to preserve fiber and other nutrients. However, this query is not about a medical condition but a culinary practice.
|
230 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.02it/s]
|
231 |
+
2025-07-31 07:13:52,790 - retrieval - INFO - Sliding window search: Found 5 results
|
232 |
+
✅ Detected Level: 5
|
233 |
+
Condition: generic medical query
|
234 |
+
Emergency Keywords: medical|emergency
|
235 |
+
Treatment Keywords: treatment|management
|
236 |
+
Execution Time: 20.900s
|
237 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
|
238 |
+
|
239 |
+
🔍 level4a_002: Level 4a: Technology query should be rejected
|
240 |
+
Query: 'best programming language to learn in 2025'
|
241 |
+
Expected Level: 4
|
242 |
+
----------------------------------------------------------------------
|
243 |
+
🎯 Executing multilevel fallback...
|
244 |
+
2025-07-31 07:13:52,799 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
|
245 |
+
2025-07-31 07:14:02,339 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address technology or education preferences like "best programming language" (which is non-medical context); however, for clarity, this query is outside my biomedical scope. In 2025 or any current year, the choice of "best" programming language is subjective and depends on industry trends, personal goals, and specific use cases (e.g., web development, mobile apps, or data science). Popular choices include Python, JavaScript, and Java, but it's crucial
|
246 |
+
2025-07-31 07:14:02,339 - llm_clients - INFO - Query Latency: 9.5400 seconds
|
247 |
+
2025-07-31 07:14:02,339 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address technology or education preferences like "best programming language" (which is non-medical context); however, for clarity, this query is outside my biomedical scope. In 2025 or any current year, the choice of "best" programming language is subjective and depends on industry trends, personal goals, and specific use cases (e.g., web development, mobile apps, or data science). Popular choices include Python, JavaScript, and Java, but it's crucial
|
248 |
+
2025-07-31 07:14:02,339 - user_prompt - INFO - Starting semantic search fallback for query: 'best programming language to learn in 2025'
|
249 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6.45it/s]
|
250 |
+
2025-07-31 07:14:02,974 - retrieval - INFO - Sliding window search: Found 5 results
|
251 |
+
2025-07-31 07:14:02,986 - user_prompt - INFO - Semantic search returned 5 results
|
252 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.16it/s]
|
253 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.42it/s]
|
254 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.06it/s]
|
255 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.97it/s]
|
256 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 40.41it/s]
|
257 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.85it/s]
|
258 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.99it/s]
|
259 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.63it/s]
|
260 |
+
2025-07-31 07:14:03,457 - user_prompt - INFO - Inferred condition: None
|
261 |
+
2025-07-31 07:14:03,457 - user_prompt - WARNING - Condition validation failed for: None
|
262 |
+
2025-07-31 07:14:03,457 - user_prompt - INFO - No suitable condition found in semantic search
|
263 |
+
2025-07-31 07:14:03,457 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
|
264 |
+
2025-07-31 07:14:13,766 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I don't analyze technology trends or recommend programming languages; however, for clarity's sake (though out of my medical scope), in 2021 (not 2025's future prediction, as I'm bound by current data), popular choices for learning include Python, JavaScript, and Java due to their versatility, wide adoption, and job market demand. Keep in mind this information is not medical advice but rather a layman's interpretation of tech trends.
|
265 |
+
|
266 |
+
Representative Condition (not
|
267 |
+
2025-07-31 07:14:13,766 - llm_clients - INFO - Query Latency: 10.3088 seconds
|
268 |
+
2025-07-31 07:14:13,767 - llm_clients - INFO - Extracted Condition: As a medical assistant, I don't analyze technology trends or recommend programming languages; however, for clarity's sake (though out of my medical scope), in 2021 (not 2025's future prediction, as I'm bound by current data), popular choices for learning include Python, JavaScript, and Java due to their versatility, wide adoption, and job market demand. Keep in mind this information is not medical advice but rather a layman's interpretation of tech trends.
|
269 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.00it/s]
|
270 |
+
2025-07-31 07:14:14,884 - retrieval - INFO - Sliding window search: Found 5 results
|
271 |
+
✅ Detected Level: 5
|
272 |
+
Condition: generic medical query
|
273 |
+
Emergency Keywords: medical|emergency
|
274 |
+
Treatment Keywords: treatment|management
|
275 |
+
Execution Time: 22.107s
|
276 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
|
277 |
+
|
278 |
+
🔍 level4a_003: Level 4a: Weather query should be rejected
|
279 |
+
Query: 'weather forecast for tomorrow'
|
280 |
+
Expected Level: 4
|
281 |
+
----------------------------------------------------------------------
|
282 |
+
🎯 Executing multilevel fallback...
|
283 |
+
2025-07-31 07:14:14,905 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
|
284 |
+
2025-07-31 07:14:24,069 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address weather forecasts; however, for context clarification, this query is unrelated to medical conditions. The requested information here is about meteorology (weather prediction) rather than health or disease. There's no representative medical condition to provide in this case.
|
285 |
+
|
286 |
+
If, however, you were referring indirectly to weather-sensitive health conditions (e.g., heat exhaustion, cold-induced asthma exacerbation), the specific condition would depend on the actual weather forecast details (temperature, humidity, etc.)
|
287 |
+
2025-07-31 07:14:24,069 - llm_clients - INFO - Query Latency: 9.1634 seconds
|
288 |
+
2025-07-31 07:14:24,069 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address weather forecasts; however, for context clarification, this query is unrelated to medical conditions. The requested information here is about meteorology (weather prediction) rather than health or disease. There's no representative medical condition to provide in this case.
|
289 |
+
2025-07-31 07:14:24,070 - user_prompt - INFO - Starting semantic search fallback for query: 'weather forecast for tomorrow'
|
290 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.17it/s]
|
291 |
+
2025-07-31 07:14:25,222 - retrieval - INFO - Sliding window search: Found 5 results
|
292 |
+
2025-07-31 07:14:25,234 - user_prompt - INFO - Semantic search returned 5 results
|
293 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.71it/s]
|
294 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.65it/s]
|
295 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.87it/s]
|
296 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.21it/s]
|
297 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.32it/s]
|
298 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.77it/s]
|
299 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 53.42it/s]
|
300 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.34it/s]
|
301 |
+
2025-07-31 07:14:25,491 - user_prompt - INFO - Inferred condition: None
|
302 |
+
2025-07-31 07:14:25,491 - user_prompt - WARNING - Condition validation failed for: None
|
303 |
+
2025-07-31 07:14:25,491 - user_prompt - INFO - No suitable condition found in semantic search
|
304 |
+
2025-07-31 07:14:25,491 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
|
305 |
+
2025-07-31 07:14:35,356 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address weather forecasts; however, for this context (to maintain representativeness in terms unrelated to diagnosis), the phrase here isn't indicative of a medical condition. Instead, it's about environmental information—specifically, a request for meteorological data (tomorrow's weather). In medical terminology, we wouldn't classify this as a condition, but for representation's sake in a non-medical context, it can be labeled as "meteorological inquiry" or simply
|
306 |
+
2025-07-31 07:14:35,356 - llm_clients - INFO - Query Latency: 9.8645 seconds
|
307 |
+
2025-07-31 07:14:35,356 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address weather forecasts; however, for this context (to maintain representativeness in terms unrelated to diagnosis), the phrase here isn't indicative of a medical condition. Instead, it's about environmental information—specifically, a request for meteorological data (tomorrow's weather). In medical terminology, we wouldn't classify this as a condition, but for representation's sake in a non-medical context, it can be labeled as "meteorological inquiry" or simply
|
308 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.19it/s]
|
309 |
+
2025-07-31 07:14:36,024 - retrieval - INFO - Sliding window search: Found 5 results
|
310 |
+
✅ Detected Level: 5
|
311 |
+
Condition: generic medical query
|
312 |
+
Emergency Keywords: medical|emergency
|
313 |
+
Treatment Keywords: treatment|management
|
314 |
+
Execution Time: 21.128s
|
315 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
|
316 |
+
|
317 |
+
🔍 level4b_001: Level 4b→5: Obscure medical query passing validation to generic search
|
318 |
+
Query: 'rare hematologic malignancy treatment approaches'
|
319 |
+
Expected Level: 5
|
320 |
+
----------------------------------------------------------------------
|
321 |
+
🎯 Executing multilevel fallback...
|
322 |
+
2025-07-31 07:14:36,033 - llm_clients - INFO - Calling Medical LLM with query: rare hematologic malignancy treatment approaches
|
323 |
+
2025-07-31 07:14:45,301 - llm_clients - INFO - Raw LLM Response: The most representative condition: Myelofibrosis (or, in context of "rare" reference, could be an even less common variant like BCR-ABL1-negative atypical CML or unclassifiable myeloproliferative neoplasm)
|
324 |
+
|
325 |
+
- For myelofibrosis, primary treatment approaches include JAK2 inhibitors (e.g., ruxolitinib), supportive care (transfusions, erythropoiesis-stimulating agents), and allog
|
326 |
+
2025-07-31 07:14:45,302 - llm_clients - INFO - Query Latency: 9.2678 seconds
|
327 |
+
2025-07-31 07:14:45,302 - llm_clients - INFO - Extracted Condition: The most representative condition: Myelofibrosis (or, in context of "rare" reference, could be an even less common variant like BCR-ABL1-negative atypical CML or unclassifiable myeloproliferative neoplasm)
|
328 |
+
2025-07-31 07:14:45,302 - user_prompt - INFO - Starting semantic search fallback for query: 'rare hematologic malignancy treatment approaches'
|
329 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.00it/s]
|
330 |
+
2025-07-31 07:14:46,428 - retrieval - INFO - Sliding window search: Found 5 results
|
331 |
+
2025-07-31 07:14:46,436 - user_prompt - INFO - Semantic search returned 5 results
|
332 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.59it/s]
|
333 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 38.61it/s]
|
334 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.66it/s]
|
335 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.40it/s]
|
336 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.09it/s]
|
337 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.42it/s]
|
338 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.98it/s]
|
339 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 66.70it/s]
|
340 |
+
2025-07-31 07:14:46,672 - user_prompt - INFO - Inferred condition: None
|
341 |
+
2025-07-31 07:14:46,672 - user_prompt - WARNING - Condition validation failed for: None
|
342 |
+
2025-07-31 07:14:46,672 - user_prompt - INFO - No suitable condition found in semantic search
|
343 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.28it/s]
|
344 |
+
2025-07-31 07:14:47,160 - retrieval - INFO - Sliding window search: Found 5 results
|
345 |
+
✅ Detected Level: 5
|
346 |
+
Condition: generic medical query
|
347 |
+
Emergency Keywords: medical|emergency
|
348 |
+
Treatment Keywords: treatment|management
|
349 |
+
Execution Time: 11.137s
|
350 |
+
🎉 Test PASSED - Expected behavior achieved
|
351 |
+
|
352 |
+
🔍 level4b_002: Level 4b→5: Rare condition requiring generic medical search
|
353 |
+
Query: 'idiopathic thrombocytopenic purpura management guidelines'
|
354 |
+
Expected Level: 5
|
355 |
+
----------------------------------------------------------------------
|
356 |
+
🎯 Executing multilevel fallback...
|
357 |
+
2025-07-31 07:14:47,170 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
|
358 |
+
2025-07-31 07:14:56,483 - llm_clients - INFO - Raw LLM Response: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
|
359 |
+
|
360 |
+
(As a medical assistant, I do not provide advice, but here's the relevant condition with context for a knowledge reference.)
|
361 |
+
In this case, the most representative condition is Idiopathic Thrombocytopenic Purpura (ITP), an autoimmune disorder characterized by low platelet count (thrombocytopenia) without identifiable underlying causes. Management guidelines typically involve
|
362 |
+
2025-07-31 07:14:56,484 - llm_clients - INFO - Query Latency: 9.3136 seconds
|
363 |
+
2025-07-31 07:14:56,484 - llm_clients - INFO - Extracted Condition: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
|
364 |
+
2025-07-31 07:14:56,484 - user_prompt - INFO - Starting semantic search fallback for query: 'idiopathic thrombocytopenic purpura management guidelines'
|
365 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.14it/s]
|
366 |
+
2025-07-31 07:14:57,082 - retrieval - INFO - Sliding window search: Found 5 results
|
367 |
+
2025-07-31 07:14:57,090 - user_prompt - INFO - Semantic search returned 5 results
|
368 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.83it/s]
|
369 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.94it/s]
|
370 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.06it/s]
|
371 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.59it/s]
|
372 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.81it/s]
|
373 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.78it/s]
|
374 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.76it/s]
|
375 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.14it/s]
|
376 |
+
2025-07-31 07:14:57,296 - user_prompt - INFO - Inferred condition: None
|
377 |
+
2025-07-31 07:14:57,296 - user_prompt - WARNING - Condition validation failed for: None
|
378 |
+
2025-07-31 07:14:57,296 - user_prompt - INFO - No suitable condition found in semantic search
|
379 |
+
2025-07-31 07:14:57,296 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
|
380 |
+
2025-07-31 07:15:06,621 - llm_clients - INFO - Raw LLM Response: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
|
381 |
+
|
382 |
+
(As a medical assistant, I don't provide advice, but describe the condition and point to standard guidelines. For ITP management, refer to professional sources like the American Society of Hematology [ASH] or National Institutes of Health [NIH].)
|
383 |
+
|
384 |
+
Idiopathic Thrombocytopenic Purpura (ITP) is an autoimmune disorder characterized by low platelet count
|
385 |
+
2025-07-31 07:15:06,621 - llm_clients - INFO - Query Latency: 9.3245 seconds
|
386 |
+
2025-07-31 07:15:06,621 - llm_clients - INFO - Extracted Condition: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
|
387 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.12it/s]
|
388 |
+
2025-07-31 07:15:07,391 - retrieval - INFO - Sliding window search: Found 5 results
|
389 |
+
✅ Detected Level: 5
|
390 |
+
Condition: generic medical query
|
391 |
+
Emergency Keywords: medical|emergency
|
392 |
+
Treatment Keywords: treatment|management
|
393 |
+
Execution Time: 20.228s
|
394 |
+
🎉 Test PASSED - Expected behavior achieved
|
395 |
+
|
396 |
+
🔍 level4b_003: Level 4b→5: Rare emergency condition → generic search
|
397 |
+
Query: 'necrotizing fasciitis surgical intervention protocols'
|
398 |
+
Expected Level: 5
|
399 |
+
----------------------------------------------------------------------
|
400 |
+
🎯 Executing multilevel fallback...
|
401 |
+
2025-07-31 07:15:07,398 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
|
402 |
+
2025-07-31 07:15:16,625 - llm_clients - INFO - Raw LLM Response: The primary medical condition: Necrotizing Fasciitis
|
403 |
+
|
404 |
+
In this context, the key condition is Necrotizing Fasciitis, a severe bacterial infection characterized by rapid destruction of subcutaneous tissue and fascia. The term provided, "surgical intervention protocols," refers to the treatment approach rather than a distinct medical condition. However, for clarity in this answer, I'll address it as it pertains to managing Necrotizing Fasciitis.
|
405 |
+
|
406 |
+
In Necrotizing Fasciitis, surgical
|
407 |
+
2025-07-31 07:15:16,625 - llm_clients - INFO - Query Latency: 9.2271 seconds
|
408 |
+
2025-07-31 07:15:16,625 - llm_clients - INFO - Extracted Condition: The primary medical condition: Necrotizing Fasciitis
|
409 |
+
2025-07-31 07:15:16,625 - user_prompt - INFO - Starting semantic search fallback for query: 'necrotizing fasciitis surgical intervention protocols'
|
410 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.01it/s]
|
411 |
+
2025-07-31 07:15:17,212 - retrieval - INFO - Sliding window search: Found 5 results
|
412 |
+
2025-07-31 07:15:17,222 - user_prompt - INFO - Semantic search returned 5 results
|
413 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.01it/s]
|
414 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.04it/s]
|
415 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.57it/s]
|
416 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.92it/s]
|
417 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.15it/s]
|
418 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.28it/s]
|
419 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.83it/s]
|
420 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.38it/s]
|
421 |
+
2025-07-31 07:15:17,449 - user_prompt - INFO - Inferred condition: None
|
422 |
+
2025-07-31 07:15:17,449 - user_prompt - WARNING - Condition validation failed for: None
|
423 |
+
2025-07-31 07:15:17,449 - user_prompt - INFO - No suitable condition found in semantic search
|
424 |
+
2025-07-31 07:15:17,449 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
|
425 |
+
2025-07-31 07:15:24,511 - llm_clients - INFO - Raw LLM Response: The most representative condition: Necrotizing Fasciitis
|
426 |
+
|
427 |
+
(As a medical assistant, I do not provide advice, only identify conditions. For necrotizing fasciitis, surgical intervention typically involves aggressive debridement—removing dead tissue—and may require repeated procedures until healthy margins are achieved. This is accompanied by supportive care and antibiotics.)
|
428 |
+
|
429 |
+
|
430 |
+
2025-07-31 07:15:24,511 - llm_clients - INFO - Query Latency: 7.0619 seconds
|
431 |
+
2025-07-31 07:15:24,511 - llm_clients - INFO - Extracted Condition: The most representative condition: Necrotizing Fasciitis
|
432 |
+
Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.83it/s]
|
433 |
+
2025-07-31 07:15:25,078 - retrieval - INFO - Sliding window search: Found 5 results
|
434 |
+
✅ Detected Level: 5
|
435 |
+
Condition: generic medical query
|
436 |
+
Emergency Keywords: medical|emergency
|
437 |
+
Treatment Keywords: treatment|management
|
438 |
+
Execution Time: 17.692s
|
439 |
+
🎉 Test PASSED - Expected behavior achieved
|
440 |
+
|
441 |
+
================================================================================
|
442 |
+
📊 MULTILEVEL FALLBACK TEST REPORT
|
443 |
+
================================================================================
|
444 |
+
🕐 Execution Summary:
|
445 |
+
Total duration: 187.465s
|
446 |
+
Average per test: 14.420s
|
447 |
+
|
448 |
+
📈 Test Results:
|
449 |
+
Total tests: 13
|
450 |
+
Passed: 7 ✅
|
451 |
+
Partial: 6 ⚠️
|
452 |
+
   Failed: 0 ❌
|
453 |
+
Success rate: 53.8%
|
454 |
+
|
455 |
+
🎯 Level Distribution Analysis:
|
456 |
+
Level 1 (Predefined Mapping): 4 tests, avg 5.752s
|
457 |
+
Level 5 (Generic Search): 9 tests, avg 17.495s
|
458 |
+
|
459 |
+
📋 Category Analysis:
|
460 |
+
level1_predefined: 3/3 (100.0%)
|
461 |
+
level2_llm: 1/2 (50.0%)
|
462 |
+
level3_semantic: 0/2 (0.0%)
|
463 |
+
level4a_rejection: 0/3 (0.0%)
|
464 |
+
level4b_to_5: 3/3 (100.0%)
|
465 |
+
|
466 |
+
📝 Detailed Test Results:
|
467 |
+
|
468 |
+
level1_001: ✅ PASS
|
469 |
+
Query: 'acute myocardial infarction treatment'
|
470 |
+
Expected Level: 1
|
471 |
+
Detected Level: 1
|
472 |
+
Condition: acute myocardial infarction
|
473 |
+
Time: 0.000s
|
474 |
+
Validation: ✅ Level 1 as expected. ✅ Condition 'acute myocardial infarction' matches expected.
|
475 |
+
|
476 |
+
level1_002: ✅ PASS
|
477 |
+
Query: 'how to manage acute stroke?'
|
478 |
+
Expected Level: 1
|
479 |
+
Detected Level: 1
|
480 |
+
Condition: acute stroke
|
481 |
+
Time: 0.000s
|
482 |
+
Validation: ✅ Level 1 as expected. ✅ Condition 'acute stroke' matches expected.
|
483 |
+
|
484 |
+
level1_003: ✅ PASS
|
485 |
+
Query: 'pulmonary embolism emergency protocol'
|
486 |
+
Expected Level: 1
|
487 |
+
Detected Level: 1
|
488 |
+
Condition: pulmonary embolism
|
489 |
+
Time: 0.000s
|
490 |
+
Validation: ✅ Level 1 as expected. ✅ Condition 'pulmonary embolism' matches expected.
|
491 |
+
|
492 |
+
level2_001: ✅ PASS
|
493 |
+
Query: 'patient with severe crushing chest pain radiating to left arm'
|
494 |
+
Expected Level: 2
|
495 |
+
Detected Level: 1
|
496 |
+
Condition: acute myocardial infarction
|
497 |
+
Time: 23.008s
|
498 |
+
Validation: ⚠️ Level 1 != expected 2. ✅ Condition 'acute myocardial infarction' matches expected.
|
499 |
+
|
500 |
+
level2_002: ⚠️ PARTIAL
|
501 |
+
Query: 'sudden onset weakness on right side with speech difficulty'
|
502 |
+
Expected Level: 2
|
503 |
+
Detected Level: 5
|
504 |
+
Condition: generic medical query
|
505 |
+
Time: 22.223s
|
506 |
+
Validation: ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
|
507 |
+
|
508 |
+
level3_001: ⚠️ PARTIAL
|
509 |
+
Query: 'emergency management of cardiovascular crisis'
|
510 |
+
Expected Level: 3
|
511 |
+
Detected Level: 5
|
512 |
+
Condition: generic medical query
|
513 |
+
Time: 11.647s
|
514 |
+
Validation: ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
|
515 |
+
|
516 |
+
level3_002: ⚠️ PARTIAL
|
517 |
+
Query: 'urgent neurological intervention protocols'
|
518 |
+
Expected Level: 3
|
519 |
+
Detected Level: 5
|
520 |
+
Condition: generic medical query
|
521 |
+
Time: 10.398s
|
522 |
+
Validation: ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
|
523 |
+
|
524 |
+
level4a_001: ⚠️ PARTIAL
|
525 |
+
Query: 'how to cook pasta properly?'
|
526 |
+
Expected Level: 4
|
527 |
+
Detected Level: 5
|
528 |
+
Condition: generic medical query
|
529 |
+
Time: 20.900s
|
530 |
+
Validation: ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
|
531 |
+
|
532 |
+
level4a_002: ⚠️ PARTIAL
|
533 |
+
Query: 'best programming language to learn in 2025'
|
534 |
+
Expected Level: 4
|
535 |
+
Detected Level: 5
|
536 |
+
Condition: generic medical query
|
537 |
+
Time: 22.107s
|
538 |
+
Validation: ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
|
539 |
+
|
540 |
+
level4a_003: ⚠️ PARTIAL
|
541 |
+
Query: 'weather forecast for tomorrow'
|
542 |
+
Expected Level: 4
|
543 |
+
Detected Level: 5
|
544 |
+
Condition: generic medical query
|
545 |
+
Time: 21.128s
|
546 |
+
Validation: ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
|
547 |
+
|
548 |
+
level4b_001: ✅ PASS
|
549 |
+
Query: 'rare hematologic malignancy treatment approaches'
|
550 |
+
Expected Level: 5
|
551 |
+
Detected Level: 5
|
552 |
+
Condition: generic medical query
|
553 |
+
Time: 11.137s
|
554 |
+
Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
|
555 |
+
|
556 |
+
level4b_002: ✅ PASS
|
557 |
+
Query: 'idiopathic thrombocytopenic purpura management guidelines'
|
558 |
+
Expected Level: 5
|
559 |
+
Detected Level: 5
|
560 |
+
Condition: generic medical query
|
561 |
+
Time: 20.228s
|
562 |
+
Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
|
563 |
+
|
564 |
+
level4b_003: ✅ PASS
|
565 |
+
Query: 'necrotizing fasciitis surgical intervention protocols'
|
566 |
+
Expected Level: 5
|
567 |
+
Detected Level: 5
|
568 |
+
Condition: generic medical query
|
569 |
+
Time: 17.692s
|
570 |
+
Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
|
tests/result_of_test_multlevel_fallback_validation_revised.md
ADDED
@@ -0,0 +1,534 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
🏥 OnCall.ai Multilevel Fallback Validation Test
|
2 |
+
============================================================
|
3 |
+
🔧 Initializing Components for Multilevel Fallback Test...
|
4 |
+
------------------------------------------------------------
|
5 |
+
1. Initializing Llama3-Med42-70B Client...
|
6 |
+
2025-07-31 07:51:06,059 - llm_clients - INFO - Medical LLM client initialized with model: m42-health/Llama3-Med42-70B
|
7 |
+
2025-07-31 07:51:06,059 - llm_clients - WARNING - Medical LLM Model: Research tool only. Not for professional medical diagnosis.
|
8 |
+
✅ LLM client initialized
|
9 |
+
2. Initializing Retrieval System...
|
10 |
+
2025-07-31 07:51:06,059 - retrieval - INFO - Initializing retrieval system...
|
11 |
+
2025-07-31 07:51:06,073 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
12 |
+
2025-07-31 07:51:06,073 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
|
13 |
+
2025-07-31 07:51:09,264 - retrieval - INFO - Embedding model loaded successfully
|
14 |
+
2025-07-31 07:51:10,711 - retrieval - INFO - Chunks loaded successfully
|
15 |
+
2025-07-31 07:51:10,824 - retrieval - INFO - Embeddings loaded successfully
|
16 |
+
2025-07-31 07:51:10,825 - retrieval - INFO - Loaded existing emergency index
|
17 |
+
2025-07-31 07:51:10,826 - retrieval - INFO - Loaded existing treatment index
|
18 |
+
2025-07-31 07:51:10,826 - retrieval - INFO - Retrieval system initialized successfully
|
19 |
+
✅ Retrieval system initialized
|
20 |
+
3. Initializing User Prompt Processor...
|
21 |
+
2025-07-31 07:51:10,826 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
22 |
+
2025-07-31 07:51:10,826 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
|
23 |
+
2025-07-31 07:51:12,702 - user_prompt - INFO - UserPromptProcessor initialized
|
24 |
+
✅ User prompt processor initialized
|
25 |
+
|
26 |
+
🎉 All components initialized successfully!
|
27 |
+
|
28 |
+
🚀 Starting Multilevel Fallback Test Suite
|
29 |
+
Total test cases: 13
|
30 |
+
Test started at: 2025-07-31 07:51:06
|
31 |
+
================================================================================
|
32 |
+
|
33 |
+
🔍 level1_001: Level 1: Direct predefined condition match
|
34 |
+
Query: 'acute myocardial infarction treatment'
|
35 |
+
Expected Level: 1
|
36 |
+
----------------------------------------------------------------------
|
37 |
+
🎯 Executing multilevel fallback...
|
38 |
+
2025-07-31 07:51:12,702 - user_prompt - INFO - Matched predefined condition: acute myocardial infarction
|
39 |
+
✅ Detected Level: 1
|
40 |
+
Condition: acute myocardial infarction
|
41 |
+
Emergency Keywords: MI|chest pain|cardiac arrest
|
42 |
+
Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
|
43 |
+
Execution Time: 0.000s
|
44 |
+
🎉 Test PASSED - Expected behavior achieved
|
45 |
+
|
46 |
+
🔍 level1_002: Level 1: Predefined stroke condition
|
47 |
+
Query: 'how to manage acute stroke?'
|
48 |
+
Expected Level: 1
|
49 |
+
----------------------------------------------------------------------
|
50 |
+
🎯 Executing multilevel fallback...
|
51 |
+
2025-07-31 07:51:12,702 - user_prompt - INFO - Matched predefined condition: acute stroke
|
52 |
+
✅ Detected Level: 1
|
53 |
+
Condition: acute stroke
|
54 |
+
Emergency Keywords: stroke|neurological deficit|sudden weakness
|
55 |
+
Treatment Keywords: tPA|thrombolysis|stroke unit care
|
56 |
+
Execution Time: 0.000s
|
57 |
+
🎉 Test PASSED - Expected behavior achieved
|
58 |
+
|
59 |
+
🔍 level1_003: Level 1: Predefined PE condition
|
60 |
+
Query: 'pulmonary embolism emergency protocol'
|
61 |
+
Expected Level: 1
|
62 |
+
----------------------------------------------------------------------
|
63 |
+
🎯 Executing multilevel fallback...
|
64 |
+
2025-07-31 07:51:12,702 - user_prompt - INFO - Matched predefined condition: pulmonary embolism
|
65 |
+
✅ Detected Level: 1
|
66 |
+
Condition: pulmonary embolism
|
67 |
+
Emergency Keywords: chest pain|shortness of breath|sudden dyspnea
|
68 |
+
Treatment Keywords: anticoagulation|heparin|embolectomy
|
69 |
+
Execution Time: 0.000s
|
70 |
+
🎉 Test PASSED - Expected behavior achieved
|
71 |
+
|
72 |
+
🔍 level2_001: Level 2: Symptom-based query requiring LLM analysis
|
73 |
+
Query: 'patient with severe crushing chest pain radiating to left arm'
|
74 |
+
Expected Level: 2
|
75 |
+
----------------------------------------------------------------------
|
76 |
+
🎯 Executing multilevel fallback...
|
77 |
+
2025-07-31 07:51:12,702 - llm_clients - INFO - Calling Medical LLM with query: patient with severe crushing chest pain radiating to left arm
|
78 |
+
2025-07-31 07:51:55,277 - llm_clients - INFO - Raw LLM Response: Medical: "Acute Myocardial Infarction" (Heart Attack)
|
79 |
+
Explanation: The described symptoms of severe crushing chest pain radiating to the left arm are highly indicative of an acute myocardial infarction, commonly known as a heart attack. This is a medical emergency caused by blockage of coronary arteries, disrupting blood supply to the heart muscle.
|
80 |
+
|
81 |
+
(Not providing advice, just categorizing the condition)
|
82 |
+
2025-07-31 07:51:55,278 - llm_clients - INFO - Query Latency: 42.5747 seconds
|
83 |
+
2025-07-31 07:51:55,278 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
|
84 |
+
✅ Detected Level: 1
|
85 |
+
Condition: acute myocardial infarction
|
86 |
+
Emergency Keywords: MI|chest pain|cardiac arrest
|
87 |
+
Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
|
88 |
+
Execution Time: 42.576s
|
89 |
+
🎉 Test PASSED - Expected behavior achieved
|
90 |
+
|
91 |
+
🔍 level2_002: Level 2: Neurological symptoms requiring LLM
|
92 |
+
Query: 'sudden onset weakness on right side with speech difficulty'
|
93 |
+
Expected Level: 2
|
94 |
+
----------------------------------------------------------------------
|
95 |
+
🎯 Executing multilevel fallback...
|
96 |
+
2025-07-31 07:51:55,279 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
|
97 |
+
2025-07-31 07:52:06,165 - llm_clients - INFO - Raw LLM Response: Medical: "Acute Ischemic Stroke" (or Cerebrovascular Accident, specifically involving right hemispheric damage causing contralateral weakness and speech impairment)
|
98 |
+
|
99 |
+
Explanation: The symptoms described - sudden onset weakness on the right side (implying left brain hemisphere involvement due to contralateral motor control) and speech difficulty - are classic indicators of an acute ischemic stroke. This condition occurs when blood flow to a region of the brain is blocked, depriving it of oxygen and nutrients,
|
100 |
+
2025-07-31 07:52:06,165 - llm_clients - INFO - Query Latency: 10.8864 seconds
|
101 |
+
2025-07-31 07:52:06,165 - llm_clients - INFO - Extracted Condition: Medical: "Acute Ischemic Stroke" (or Cerebrovascular Accident, specifically involving right hemispheric damage causing contralateral weakness and speech impairment)
|
102 |
+
2025-07-31 07:52:06,166 - user_prompt - INFO - Starting semantic search fallback for query: 'sudden onset weakness on right side with speech difficulty'
|
103 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.61it/s]
|
104 |
+
2025-07-31 07:52:07,568 - retrieval - INFO - Sliding window search: Found 5 results
|
105 |
+
2025-07-31 07:52:07,575 - user_prompt - INFO - Semantic search returned 5 results
|
106 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.70it/s]
|
107 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.71it/s]
|
108 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.64it/s]
|
109 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.46it/s]
|
110 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.59it/s]
|
111 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.61it/s]
|
112 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.26it/s]
|
113 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.86it/s]
|
114 |
+
2025-07-31 07:52:07,896 - user_prompt - INFO - Inferred condition: None
|
115 |
+
2025-07-31 07:52:07,896 - user_prompt - WARNING - Condition validation failed for: None
|
116 |
+
2025-07-31 07:52:07,896 - user_prompt - INFO - No suitable condition found in semantic search
|
117 |
+
2025-07-31 07:52:07,897 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
|
118 |
+
2025-07-31 07:52:16,923 - llm_clients - INFO - Raw LLM Response: Medical: "Cerebrovascular Accident (CVA) - Ischemic Stroke" (or simply "Ischemic Stroke" for brevity, as it's the most specific diagnosis here)
|
119 |
+
- Explanation: The symptoms described, sudden right-sided weakness and speech difficulty, are classic indicators of an ischemic stroke, which occurs when blood flow to the brain is blocked by a clot or narrowed blood vessels.
|
120 |
+
|
121 |
+
Note: While hemorrhagic stroke is another type of CVA, the given symptoms
|
122 |
+
2025-07-31 07:52:16,923 - llm_clients - INFO - Query Latency: 9.0264 seconds
|
123 |
+
2025-07-31 07:52:16,923 - llm_clients - INFO - Extracted Condition: Medical: "Cerebrovascular Accident (CVA) - Ischemic Stroke" (or simply "Ischemic Stroke" for brevity, as it's the most specific diagnosis here)
|
124 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.89it/s]
|
125 |
+
2025-07-31 07:52:17,964 - retrieval - INFO - Sliding window search: Found 5 results
|
126 |
+
✅ Detected Level: 5
|
127 |
+
Condition: generic medical query
|
128 |
+
Emergency Keywords: medical|emergency
|
129 |
+
Treatment Keywords: treatment|management
|
130 |
+
Execution Time: 22.751s
|
131 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
|
132 |
+
|
133 |
+
🔍 level3_001: Level 3: Generic medical terms requiring semantic search
|
134 |
+
Query: 'emergency management of cardiovascular crisis'
|
135 |
+
Expected Level: 3
|
136 |
+
----------------------------------------------------------------------
|
137 |
+
🎯 Executing multilevel fallback...
|
138 |
+
2025-07-31 07:52:18,030 - llm_clients - INFO - Calling Medical LLM with query: emergency management of cardiovascular crisis
|
139 |
+
2025-07-31 07:52:27,145 - llm_clients - INFO - Raw LLM Response: Medical: "Cardiovascular crisis" in this context (emergency management) is best represented by "Acute Myocardial Infarction (AMI)" or "ST-Elevation Myocardial Infarction (STEMI)," as both terms describe severe, time-critical cardiac events requiring immediate intervention. However, if considering a broader "cardiovascular crisis" that's not limited to infarction, "Cardiogenic Shock" might also be applicable, as it represents a severe, life
|
140 |
+
2025-07-31 07:52:27,145 - llm_clients - INFO - Query Latency: 9.1143 seconds
|
141 |
+
2025-07-31 07:52:27,145 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
|
142 |
+
✅ Detected Level: 1
|
143 |
+
Condition: acute myocardial infarction
|
144 |
+
Emergency Keywords: MI|chest pain|cardiac arrest
|
145 |
+
Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
|
146 |
+
Execution Time: 9.115s
|
147 |
+
⚠️ Test PARTIAL - ⚠️ Level 1 != expected 3. ⚠️ Condition 'acute myocardial infarction' != expected [].
|
148 |
+
|
149 |
+
🔍 level3_002: Level 3: Medical terminology requiring semantic fallback
|
150 |
+
Query: 'urgent neurological intervention protocols'
|
151 |
+
Expected Level: 3
|
152 |
+
----------------------------------------------------------------------
|
153 |
+
🎯 Executing multilevel fallback...
|
154 |
+
2025-07-31 07:52:27,145 - llm_clients - INFO - Calling Medical LLM with query: urgent neurological intervention protocols
|
155 |
+
2025-07-31 07:52:37,615 - llm_clients - INFO - Raw LLM Response: Medical: "Acute Ischemic Stroke" (representing a condition requiring urgent neurological intervention, specifically thrombectomy or thrombolysis protocols)
|
156 |
+
|
157 |
+
Explanation: Acute ischemic stroke necessitates rapid medical response, as timely interventions like mechanical thrombectomy or intravenous thrombolysis can significantly improve patient outcomes. The term "urgent neurological intervention protocols" in this context likely refers to these treatments for stroke, making "Acute Ischemic Stroke" the most representative medical condition.
|
158 |
+
2025-07-31 07:52:37,615 - llm_clients - INFO - Query Latency: 10.4695 seconds
|
159 |
+
2025-07-31 07:52:37,615 - llm_clients - INFO - Extracted Condition: Medical: "Acute Ischemic Stroke" (representing a condition requiring urgent neurological intervention, specifically thrombectomy or thrombolysis protocols)
|
160 |
+
2025-07-31 07:52:37,616 - user_prompt - INFO - Starting semantic search fallback for query: 'urgent neurological intervention protocols'
|
161 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.45it/s]
|
162 |
+
2025-07-31 07:52:38,539 - retrieval - INFO - Sliding window search: Found 5 results
|
163 |
+
2025-07-31 07:52:38,549 - user_prompt - INFO - Semantic search returned 5 results
|
164 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.55it/s]
|
165 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 49.53it/s]
|
166 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 52.73it/s]
|
167 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.13it/s]
|
168 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.36it/s]
|
169 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.40it/s]
|
170 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.10it/s]
|
171 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.29it/s]
|
172 |
+
2025-07-31 07:52:38,759 - user_prompt - INFO - Inferred condition: None
|
173 |
+
2025-07-31 07:52:38,759 - user_prompt - WARNING - Condition validation failed for: None
|
174 |
+
2025-07-31 07:52:38,759 - user_prompt - INFO - No suitable condition found in semantic search
|
175 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.09it/s]
|
176 |
+
2025-07-31 07:52:39,345 - retrieval - INFO - Sliding window search: Found 5 results
|
177 |
+
✅ Detected Level: 5
|
178 |
+
Condition: generic medical query
|
179 |
+
Emergency Keywords: medical|emergency
|
180 |
+
Treatment Keywords: treatment|management
|
181 |
+
Execution Time: 12.249s
|
182 |
+
⚠️ Test PARTIAL - ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
|
183 |
+
|
184 |
+
🔍 level4a_001: Level 4a: Non-medical query should be rejected
|
185 |
+
Query: 'how to cook pasta properly?'
|
186 |
+
Expected Level: 4
|
187 |
+
----------------------------------------------------------------------
|
188 |
+
🎯 Executing multilevel fallback...
|
189 |
+
2025-07-31 07:52:39,395 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
|
190 |
+
2025-07-31 07:52:45,753 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about culinary technique (cooking pasta) and not related to medical conditions or health issues. It does not represent a medical topic for diagnosis or advice. Instead, it's a question of food preparation, typically addressed in cookbooks or culinary resources.
|
191 |
+
2025-07-31 07:52:45,753 - llm_clients - INFO - Query Latency: 6.3575 seconds
|
192 |
+
2025-07-31 07:52:45,753 - llm_clients - INFO - Extracted Condition:
|
193 |
+
2025-07-31 07:52:45,753 - user_prompt - INFO - Starting semantic search fallback for query: 'how to cook pasta properly?'
|
194 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.47it/s]
|
195 |
+
2025-07-31 07:52:47,084 - retrieval - INFO - Sliding window search: Found 5 results
|
196 |
+
2025-07-31 07:52:47,091 - user_prompt - INFO - Semantic search returned 5 results
|
197 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.54it/s]
|
198 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.74it/s]
|
199 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.14it/s]
|
200 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.37it/s]
|
201 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.26it/s]
|
202 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.53it/s]
|
203 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.35it/s]
|
204 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.17it/s]
|
205 |
+
2025-07-31 07:52:47,305 - user_prompt - INFO - Inferred condition: None
|
206 |
+
2025-07-31 07:52:47,305 - user_prompt - WARNING - Condition validation failed for: None
|
207 |
+
2025-07-31 07:52:47,305 - user_prompt - INFO - No suitable condition found in semantic search
|
208 |
+
2025-07-31 07:52:47,305 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
|
209 |
+
2025-07-31 07:52:53,999 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about culinary technique (cooking pasta) and not related to any medical condition or health issue. It involves instructions on food preparation rather than addressing a disease, symptom, or medical concern.
|
210 |
+
2025-07-31 07:52:53,999 - llm_clients - INFO - Query Latency: 6.6933 seconds
|
211 |
+
2025-07-31 07:52:53,999 - llm_clients - INFO - Extracted Condition:
|
212 |
+
✅ Detected Level: 4
|
213 |
+
Condition: None
|
214 |
+
Emergency Keywords: None
|
215 |
+
Treatment Keywords: None
|
216 |
+
Execution Time: 14.604s
|
217 |
+
🎉 Test PASSED - Expected behavior achieved
|
218 |
+
|
219 |
+
🔍 level4a_002: Level 4a: Technology query should be rejected
|
220 |
+
Query: 'best programming language to learn in 2025'
|
221 |
+
Expected Level: 4
|
222 |
+
----------------------------------------------------------------------
|
223 |
+
🎯 Executing multilevel fallback...
|
224 |
+
2025-07-31 07:52:54,000 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
|
225 |
+
2025-07-31 07:53:00,100 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about technology (specifically, programming languages) and their future relevance, rather than a medical condition or health topic. It doesn't pertain to diagnosis, treatment, or any medical aspect. Therefore, it's not a medical query.
|
226 |
+
2025-07-31 07:53:00,100 - llm_clients - INFO - Query Latency: 6.1004 seconds
|
227 |
+
2025-07-31 07:53:00,100 - llm_clients - INFO - Extracted Condition:
|
228 |
+
2025-07-31 07:53:00,100 - user_prompt - INFO - Starting semantic search fallback for query: 'best programming language to learn in 2025'
|
229 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.94it/s]
|
230 |
+
2025-07-31 07:53:00,968 - retrieval - INFO - Sliding window search: Found 5 results
|
231 |
+
2025-07-31 07:53:01,048 - user_prompt - INFO - Semantic search returned 5 results
|
232 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.26it/s]
|
233 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.37it/s]
|
234 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.33it/s]
|
235 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.62it/s]
|
236 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.23it/s]
|
237 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.59it/s]
|
238 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.53it/s]
|
239 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.69it/s]
|
240 |
+
2025-07-31 07:53:01,255 - user_prompt - INFO - Inferred condition: None
|
241 |
+
2025-07-31 07:53:01,255 - user_prompt - WARNING - Condition validation failed for: None
|
242 |
+
2025-07-31 07:53:01,255 - user_prompt - INFO - No suitable condition found in semantic search
|
243 |
+
2025-07-31 07:53:01,256 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
|
244 |
+
2025-07-31 07:53:06,397 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about selecting a programming language for future learning (in 2025) and has no relation to medical conditions or healthcare. It falls under the domain of computer science and technology education.
|
245 |
+
2025-07-31 07:53:06,397 - llm_clients - INFO - Query Latency: 5.1410 seconds
|
246 |
+
2025-07-31 07:53:06,397 - llm_clients - INFO - Extracted Condition:
|
247 |
+
✅ Detected Level: 4
|
248 |
+
Condition: None
|
249 |
+
Emergency Keywords: None
|
250 |
+
Treatment Keywords: None
|
251 |
+
Execution Time: 12.397s
|
252 |
+
🎉 Test PASSED - Expected behavior achieved
|
253 |
+
|
254 |
+
🔍 level4a_003: Level 4a: Weather query should be rejected
|
255 |
+
Query: 'weather forecast for tomorrow'
|
256 |
+
Expected Level: 4
|
257 |
+
----------------------------------------------------------------------
|
258 |
+
🎯 Executing multilevel fallback...
|
259 |
+
2025-07-31 07:53:06,397 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
|
260 |
+
2025-07-31 07:53:11,119 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about meteorological information (weather prediction) and not related to any medical condition or health topic. It falls under environmental or general information, not medicine.
|
261 |
+
2025-07-31 07:53:11,120 - llm_clients - INFO - Query Latency: 4.7219 seconds
|
262 |
+
2025-07-31 07:53:11,120 - llm_clients - INFO - Extracted Condition:
|
263 |
+
2025-07-31 07:53:11,120 - user_prompt - INFO - Starting semantic search fallback for query: 'weather forecast for tomorrow'
|
264 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.01it/s]
|
265 |
+
2025-07-31 07:53:12,200 - retrieval - INFO - Sliding window search: Found 5 results
|
266 |
+
2025-07-31 07:53:12,209 - user_prompt - INFO - Semantic search returned 5 results
|
267 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.36it/s]
|
268 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.03it/s]
|
269 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 53.14it/s]
|
270 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.88it/s]
|
271 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.51it/s]
|
272 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.11it/s]
|
273 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.37it/s]
|
274 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.13it/s]
|
275 |
+
2025-07-31 07:53:12,415 - user_prompt - INFO - Inferred condition: None
|
276 |
+
2025-07-31 07:53:12,415 - user_prompt - WARNING - Condition validation failed for: None
|
277 |
+
2025-07-31 07:53:12,415 - user_prompt - INFO - No suitable condition found in semantic search
|
278 |
+
2025-07-31 07:53:12,415 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
|
279 |
+
2025-07-31 07:53:17,281 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about meteorology (predicting weather conditions) and not related to medical conditions or health issues. It doesn't involve symptoms, diagnoses, or any aspect of healthcare.
|
280 |
+
2025-07-31 07:53:17,281 - llm_clients - INFO - Query Latency: 4.8653 seconds
|
281 |
+
2025-07-31 07:53:17,281 - llm_clients - INFO - Extracted Condition:
|
282 |
+
✅ Detected Level: 4
|
283 |
+
Condition: None
|
284 |
+
Emergency Keywords: None
|
285 |
+
Treatment Keywords: None
|
286 |
+
Execution Time: 10.884s
|
287 |
+
🎉 Test PASSED - Expected behavior achieved
|
288 |
+
|
289 |
+
🔍 level4b_001: Level 4b→5: Obscure medical query passing validation to generic search
|
290 |
+
Query: 'rare hematologic malignancy treatment approaches'
|
291 |
+
Expected Level: 5
|
292 |
+
----------------------------------------------------------------------
|
293 |
+
🎯 Executing multilevel fallback...
|
294 |
+
2025-07-31 07:53:17,282 - llm_clients - INFO - Calling Medical LLM with query: rare hematologic malignancy treatment approaches
|
295 |
+
2025-07-31 07:53:26,329 - llm_clients - INFO - Raw LLM Response: Medical: "rare hematologic malignancy treatment approaches" → "Targeted Therapy for Agnogenic Myeloid Metaplasia (or currently, 'Agnogenic/Idiopathic: Myelofibrosis' in modern classification, as part of CMML-excluded rare myeloproliferative neoplasms) or, alternatively, 'Chimeric Antigen Receptor T-Cell therapy (CAR-T) for rare B-cell lymphomas like Primary Mediastinal
|
296 |
+
2025-07-31 07:53:26,329 - llm_clients - INFO - Query Latency: 9.0470 seconds
|
297 |
+
2025-07-31 07:53:26,331 - llm_clients - INFO - Extracted Condition: Medical: "rare hematologic malignancy treatment approaches" → "Targeted Therapy for Agnogenic Myeloid Metaplasia (or currently, 'Agnogenic/Idiopathic: Myelofibrosis' in modern classification, as part of CMML-excluded rare myeloproliferative neoplasms) or, alternatively, 'Chimeric Antigen Receptor T-Cell therapy (CAR-T) for rare B-cell lymphomas like Primary Mediastinal
|
298 |
+
2025-07-31 07:53:26,331 - user_prompt - INFO - Starting semantic search fallback for query: 'rare hematologic malignancy treatment approaches'
|
299 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.05it/s]
|
300 |
+
2025-07-31 07:53:26,871 - retrieval - INFO - Sliding window search: Found 5 results
|
301 |
+
2025-07-31 07:53:26,880 - user_prompt - INFO - Semantic search returned 5 results
|
302 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.32it/s]
|
303 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.77it/s]
|
304 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.97it/s]
|
305 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.97it/s]
|
306 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.97it/s]
|
307 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.87it/s]
|
308 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.11it/s]
|
309 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.43it/s]
|
310 |
+
2025-07-31 07:53:27,089 - user_prompt - INFO - Inferred condition: None
|
311 |
+
2025-07-31 07:53:27,089 - user_prompt - WARNING - Condition validation failed for: None
|
312 |
+
2025-07-31 07:53:27,089 - user_prompt - INFO - No suitable condition found in semantic search
|
313 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.43it/s]
|
314 |
+
2025-07-31 07:53:27,626 - retrieval - INFO - Sliding window search: Found 5 results
|
315 |
+
✅ Detected Level: 5
|
316 |
+
Condition: generic medical query
|
317 |
+
Emergency Keywords: medical|emergency
|
318 |
+
Treatment Keywords: treatment|management
|
319 |
+
Execution Time: 10.356s
|
320 |
+
🎉 Test PASSED - Expected behavior achieved
|
321 |
+
|
322 |
+
🔍 level4b_002: Level 4b→5: Rare condition requiring generic medical search
|
323 |
+
Query: 'idiopathic thrombocytopenic purpura management guidelines'
|
324 |
+
Expected Level: 5
|
325 |
+
----------------------------------------------------------------------
|
326 |
+
🎯 Executing multilevel fallback...
|
327 |
+
2025-07-31 07:53:27,638 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
|
328 |
+
2025-07-31 07:53:36,704 - llm_clients - INFO - Raw LLM Response: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for managing this autoimmune platelet disorder, which include corticosteroids, IVIG, thrombopoietin receptor agonists, or splenectomy in certain cases)
|
329 |
+
|
330 |
+
Explanation: This query is medical because it pertains to the guidelines for treating a specific blood disorder, Idiopathic Thrombocytopenic
|
331 |
+
2025-07-31 07:53:36,704 - llm_clients - INFO - Query Latency: 9.0658 seconds
|
332 |
+
2025-07-31 07:53:36,704 - llm_clients - INFO - Extracted Condition: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for managing this autoimmune platelet disorder, which include corticosteroids, IVIG, thrombopoietin receptor agonists, or splenectomy in certain cases)
|
333 |
+
2025-07-31 07:53:36,704 - user_prompt - INFO - Starting semantic search fallback for query: 'idiopathic thrombocytopenic purpura management guidelines'
|
334 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.61it/s]
|
335 |
+
2025-07-31 07:53:37,450 - retrieval - INFO - Sliding window search: Found 5 results
|
336 |
+
2025-07-31 07:53:37,459 - user_prompt - INFO - Semantic search returned 5 results
|
337 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.72it/s]
|
338 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.43it/s]
|
339 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.51it/s]
|
340 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.38it/s]
|
341 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.72it/s]
|
342 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.74it/s]
|
343 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.79it/s]
|
344 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.22it/s]
|
345 |
+
2025-07-31 07:53:37,691 - user_prompt - INFO - Inferred condition: None
|
346 |
+
2025-07-31 07:53:37,691 - user_prompt - WARNING - Condition validation failed for: None
|
347 |
+
2025-07-31 07:53:37,691 - user_prompt - INFO - No suitable condition found in semantic search
|
348 |
+
2025-07-31 07:53:37,691 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
|
349 |
+
2025-07-31 07:53:47,836 - llm_clients - INFO - Raw LLM Response: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for therapy in this autoimmune platelet disorder, which may include corticosteroids, intravenous immunoglobulin, or thrombopoietin receptor agonists, among other strategies).
|
350 |
+
|
351 |
+
Explanation: This query is medical because it discusses guidelines for managing a specific blood disorder, idiopathic thrombocytopenic purpura
|
352 |
+
2025-07-31 07:53:47,836 - llm_clients - INFO - Query Latency: 10.1445 seconds
|
353 |
+
2025-07-31 07:53:47,836 - llm_clients - INFO - Extracted Condition: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for therapy in this autoimmune platelet disorder, which may include corticosteroids, intravenous immunoglobulin, or thrombopoietin receptor agonists, among other strategies).
|
354 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.06it/s]
|
355 |
+
2025-07-31 07:53:48,812 - retrieval - INFO - Sliding window search: Found 5 results
|
356 |
+
✅ Detected Level: 5
|
357 |
+
Condition: generic medical query
|
358 |
+
Emergency Keywords: medical|emergency
|
359 |
+
Treatment Keywords: treatment|management
|
360 |
+
Execution Time: 21.183s
|
361 |
+
🎉 Test PASSED - Expected behavior achieved
|
362 |
+
|
363 |
+
🔍 level4b_003: Level 4b→5: Rare emergency condition → generic search
|
364 |
+
Query: 'necrotizing fasciitis surgical intervention protocols'
|
365 |
+
Expected Level: 5
|
366 |
+
----------------------------------------------------------------------
|
367 |
+
🎯 Executing multilevel fallback...
|
368 |
+
2025-07-31 07:53:48,821 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
|
369 |
+
2025-07-31 07:53:57,799 - llm_clients - INFO - Raw LLM Response: Medical: "Necrotizing Fasciitis" - In this context, the primary medical condition is Necrotizing Fasciitis, a severe soft tissue infection characterized by rapid progression and tissue death. The phrase "surgical intervention protocols" refers to the medical procedures and guidelines for surgically managing this condition, typically involving debridement (removal of dead tissue) and sometimes amputation.
|
370 |
+
|
371 |
+
Explanation: This query is medical because it pertains to a specific infectious disease (Necrotizing
|
372 |
+
2025-07-31 07:53:57,799 - llm_clients - INFO - Query Latency: 8.9777 seconds
|
373 |
+
2025-07-31 07:53:57,800 - llm_clients - INFO - Extracted Condition: Medical: "Necrotizing Fasciitis" - In this context, the primary medical condition is Necrotizing Fasciitis, a severe soft tissue infection characterized by rapid progression and tissue death. The phrase "surgical intervention protocols" refers to the medical procedures and guidelines for surgically managing this condition, typically involving debridement (removal of dead tissue) and sometimes amputation.
|
374 |
+
2025-07-31 07:53:57,800 - user_prompt - INFO - Starting semantic search fallback for query: 'necrotizing fasciitis surgical intervention protocols'
|
375 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.58it/s]
|
376 |
+
2025-07-31 07:53:58,405 - retrieval - INFO - Sliding window search: Found 5 results
|
377 |
+
2025-07-31 07:53:58,414 - user_prompt - INFO - Semantic search returned 5 results
|
378 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.81it/s]
|
379 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.09it/s]
|
380 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.49it/s]
|
381 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.11it/s]
|
382 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.81it/s]
|
383 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.57it/s]
|
384 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.03it/s]
|
385 |
+
Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.29it/s]
|
386 |
+
2025-07-31 07:53:58,638 - user_prompt - INFO - Inferred condition: None
|
387 |
+
2025-07-31 07:53:58,638 - user_prompt - WARNING - Condition validation failed for: None
|
388 |
+
2025-07-31 07:53:58,638 - user_prompt - INFO - No suitable condition found in semantic search
|
389 |
+
2025-07-31 07:53:58,638 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
|
390 |
+
2025-07-31 07:53:58,758 - llm_clients - ERROR - Medical LLM query error: 402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-688b8386-259e81a24556b80a163e3d17;5ec89b2d-e0da-4255-90b6-f0c7e9577b38)
|
391 |
+
|
392 |
+
You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.
|
393 |
+
2025-07-31 07:53:58,758 - llm_clients - ERROR - Error Type: HfHubHTTPError
|
394 |
+
2025-07-31 07:53:58,758 - llm_clients - ERROR - Detailed Error: HfHubHTTPError('402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-688b8386-259e81a24556b80a163e3d17;5ec89b2d-e0da-4255-90b6-f0c7e9577b38)\n\nYou have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.')
|
395 |
+
2025-07-31 07:53:58,758 - llm_clients - ERROR - Query Latency (on error): 0.1196 seconds
|
396 |
+
2025-07-31 07:53:58,758 - llm_clients - ERROR - Query that caused error: necrotizing fasciitis surgical intervention protocols
|
397 |
+
✅ Detected Level: 4
|
398 |
+
Condition: None
|
399 |
+
Emergency Keywords: None
|
400 |
+
Treatment Keywords: None
|
401 |
+
Execution Time: 9.937s
|
402 |
+
⚠️ Test PARTIAL - ⚠️ Level 4 != expected 5. ⚠️ Should trigger generic medical search.
|
403 |
+
|
404 |
+
================================================================================
|
405 |
+
📊 MULTILEVEL FALLBACK TEST REPORT
|
406 |
+
================================================================================
|
407 |
+
🕐 Execution Summary:
|
408 |
+
Total duration: 172.699s
|
409 |
+
Average per test: 13.285s
|
410 |
+
|
411 |
+
📈 Test Results:
|
412 |
+
Total tests: 13
|
413 |
+
Passed: 9 ✅
|
414 |
+
Partial: 4 ⚠️
|
415 |
+
Failed: 4 ❌
|
416 |
+
Success rate: 69.2%
|
417 |
+
|
418 |
+
🎯 Level Distribution Analysis:
|
419 |
+
Level 1 (Predefined Mapping): 5 tests, avg 10.338s
|
420 |
+
Level 4 (Validation Rejection): 4 tests, avg 11.956s
|
421 |
+
Level 5 (Generic Search): 4 tests, avg 16.635s
|
422 |
+
|
423 |
+
📋 Category Analysis:
|
424 |
+
level1_predefined: 3/3 (100.0%)
|
425 |
+
level2_llm: 1/2 (50.0%)
|
426 |
+
level3_semantic: 0/2 (0.0%)
|
427 |
+
level4a_rejection: 3/3 (100.0%)
|
428 |
+
level4b_to_5: 2/3 (66.7%)
|
429 |
+
|
430 |
+
📝 Detailed Test Results:
|
431 |
+
|
432 |
+
level1_001: ✅ PASS
|
433 |
+
Query: 'acute myocardial infarction treatment'
|
434 |
+
Expected Level: 1
|
435 |
+
Detected Level: 1
|
436 |
+
Condition: acute myocardial infarction
|
437 |
+
Time: 0.000s
|
438 |
+
Validation: ✅ Level 1 as expected. ✅ Condition 'acute myocardial infarction' matches expected.
|
439 |
+
|
440 |
+
level1_002: ✅ PASS
|
441 |
+
Query: 'how to manage acute stroke?'
|
442 |
+
Expected Level: 1
|
443 |
+
Detected Level: 1
|
444 |
+
Condition: acute stroke
|
445 |
+
Time: 0.000s
|
446 |
+
Validation: ✅ Level 1 as expected. ✅ Condition 'acute stroke' matches expected.
|
447 |
+
|
448 |
+
level1_003: ✅ PASS
|
449 |
+
Query: 'pulmonary embolism emergency protocol'
|
450 |
+
Expected Level: 1
|
451 |
+
Detected Level: 1
|
452 |
+
Condition: pulmonary embolism
|
453 |
+
Time: 0.000s
|
454 |
+
Validation: ✅ Level 1 as expected. ✅ Condition 'pulmonary embolism' matches expected.
|
455 |
+
|
456 |
+
level2_001: ✅ PASS
|
457 |
+
Query: 'patient with severe crushing chest pain radiating to left arm'
|
458 |
+
Expected Level: 2
|
459 |
+
Detected Level: 1
|
460 |
+
Condition: acute myocardial infarction
|
461 |
+
Time: 42.576s
|
462 |
+
Validation: ⚠️ Level 1 != expected 2. ✅ Condition 'acute myocardial infarction' matches expected.
|
463 |
+
|
464 |
+
level2_002: ⚠️ PARTIAL
|
465 |
+
Query: 'sudden onset weakness on right side with speech difficulty'
|
466 |
+
Expected Level: 2
|
467 |
+
Detected Level: 5
|
468 |
+
Condition: generic medical query
|
469 |
+
Time: 22.751s
|
470 |
+
Validation: ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
|
471 |
+
|
472 |
+
level3_001: ⚠️ PARTIAL
|
473 |
+
Query: 'emergency management of cardiovascular crisis'
|
474 |
+
Expected Level: 3
|
475 |
+
Detected Level: 1
|
476 |
+
Condition: acute myocardial infarction
|
477 |
+
Time: 9.115s
|
478 |
+
Validation: ⚠️ Level 1 != expected 3. ⚠️ Condition 'acute myocardial infarction' != expected [].
|
479 |
+
|
480 |
+
level3_002: ⚠️ PARTIAL
|
481 |
+
Query: 'urgent neurological intervention protocols'
|
482 |
+
Expected Level: 3
|
483 |
+
Detected Level: 5
|
484 |
+
Condition: generic medical query
|
485 |
+
Time: 12.249s
|
486 |
+
Validation: ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
|
487 |
+
|
488 |
+
level4a_001: ✅ PASS
|
489 |
+
Query: 'how to cook pasta properly?'
|
490 |
+
Expected Level: 4
|
491 |
+
Detected Level: 4
|
492 |
+
Condition: None
|
493 |
+
Time: 14.604s
|
494 |
+
Validation: ✅ Level 4 as expected. ✅ Query correctly rejected.
|
495 |
+
|
496 |
+
level4a_002: ✅ PASS
|
497 |
+
Query: 'best programming language to learn in 2025'
|
498 |
+
Expected Level: 4
|
499 |
+
Detected Level: 4
|
500 |
+
Condition: None
|
501 |
+
Time: 12.397s
|
502 |
+
Validation: ✅ Level 4 as expected. ✅ Query correctly rejected.
|
503 |
+
|
504 |
+
level4a_003: ✅ PASS
|
505 |
+
Query: 'weather forecast for tomorrow'
|
506 |
+
Expected Level: 4
|
507 |
+
Detected Level: 4
|
508 |
+
Condition: None
|
509 |
+
Time: 10.884s
|
510 |
+
Validation: ✅ Level 4 as expected. ✅ Query correctly rejected.
|
511 |
+
|
512 |
+
level4b_001: ✅ PASS
|
513 |
+
Query: 'rare hematologic malignancy treatment approaches'
|
514 |
+
Expected Level: 5
|
515 |
+
Detected Level: 5
|
516 |
+
Condition: generic medical query
|
517 |
+
Time: 10.356s
|
518 |
+
Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
|
519 |
+
|
520 |
+
level4b_002: ✅ PASS
|
521 |
+
Query: 'idiopathic thrombocytopenic purpura management guidelines'
|
522 |
+
Expected Level: 5
|
523 |
+
Detected Level: 5
|
524 |
+
Condition: generic medical query
|
525 |
+
Time: 21.183s
|
526 |
+
Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
|
527 |
+
|
528 |
+
level4b_003: ⚠️ PARTIAL
|
529 |
+
Query: 'necrotizing fasciitis surgical intervention protocols'
|
530 |
+
Expected Level: 5
|
531 |
+
Detected Level: 4
|
532 |
+
Condition: None
|
533 |
+
Time: 9.937s
|
534 |
+
Validation: ⚠️ Level 4 != expected 5. ⚠️ Should trigger generic medical search.
|
tests/result_of_test_userinput_userprompt_medical_condition_llm.md
ADDED
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
🏥 OnCall.ai Medical Query Processing Pipeline Test
|
2 |
+
============================================================
|
3 |
+
🔧 Initializing Pipeline Components...
|
4 |
+
--------------------------------------------------
|
5 |
+
1. Initializing Llama3-Med42-70B Client...
|
6 |
+
2025-07-31 06:38:22,609 - llm_clients - INFO - Medical LLM client initialized with model: m42-health/Llama3-Med42-70B
|
7 |
+
2025-07-31 06:38:22,609 - llm_clients - WARNING - Medical LLM Model: Research tool only. Not for professional medical diagnosis.
|
8 |
+
✅ LLM client initialized successfully
|
9 |
+
2. Initializing Retrieval System...
|
10 |
+
2025-07-31 06:38:22,609 - retrieval - INFO - Initializing retrieval system...
|
11 |
+
2025-07-31 06:38:22,621 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
12 |
+
2025-07-31 06:38:22,621 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
|
13 |
+
2025-07-31 06:38:26,965 - retrieval - INFO - Embedding model loaded successfully
|
14 |
+
2025-07-31 06:38:28,444 - retrieval - INFO - Chunks loaded successfully
|
15 |
+
2025-07-31 06:38:28,532 - retrieval - INFO - Embeddings loaded successfully
|
16 |
+
2025-07-31 06:38:28,533 - retrieval - INFO - Loaded existing emergency index
|
17 |
+
2025-07-31 06:38:28,534 - retrieval - INFO - Loaded existing treatment index
|
18 |
+
2025-07-31 06:38:28,534 - retrieval - INFO - Retrieval system initialized successfully
|
19 |
+
✅ Retrieval system initialized successfully
|
20 |
+
3. Initializing User Prompt Processor...
|
21 |
+
2025-07-31 06:38:28,534 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
22 |
+
2025-07-31 06:38:28,534 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
|
23 |
+
2025-07-31 06:38:30,716 - user_prompt - INFO - UserPromptProcessor initialized
|
24 |
+
✅ User prompt processor initialized successfully
|
25 |
+
|
26 |
+
🎉 All components initialized successfully!
|
27 |
+
|
28 |
+
🚀 Starting Comprehensive Pipeline Test
|
29 |
+
Total test cases: 6
|
30 |
+
Test started at: 2025-07-31 06:38:22
|
31 |
+
================================================================================
|
32 |
+
|
33 |
+
🔍 test_001: Classic acute myocardial infarction query
|
34 |
+
Query: 'how to treat acute MI?'
|
35 |
+
------------------------------------------------------------
|
36 |
+
Step 1: Extracting medical condition and keywords...
|
37 |
+
2025-07-31 06:38:30,716 - llm_clients - INFO - Calling Medical LLM with query: how to treat acute MI?
|
38 |
+
2025-07-31 06:39:12,449 - llm_clients - INFO - Raw LLM Response: The most representative condition: Acute Myocardial Infarction (AMI, or Heart Attack)
|
39 |
+
|
40 |
+
For treatment guidance: Acute myocardial infarction is managed by cardiologists and emergency medical teams, not medical assistants. However, for informational purposes, primary treatments include:
|
41 |
+
1. Reperfusion therapy: This may involve fibrinolysis (clot-busting medications) or percutaneous coronary intervention (PCI, such as angioplasty and stenting).
|
42 |
+
2. Antiplatelet therapy
|
43 |
+
2025-07-31 06:39:12,450 - llm_clients - INFO - Query Latency: 41.7327 seconds
|
44 |
+
2025-07-31 06:39:12,450 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
|
45 |
+
Condition: acute myocardial infarction
|
46 |
+
Emergency keywords: MI|chest pain|cardiac arrest
|
47 |
+
Treatment keywords: aspirin|nitroglycerin|thrombolytic|PCI
|
48 |
+
Source: predefined_mapping
|
49 |
+
Duration: 41.734s
|
50 |
+
|
51 |
+
Step 2: User confirmation process...
|
52 |
+
Confirmation type: confirmation_needed
|
53 |
+
|
54 |
+
Step 3: Executing retrieval...
|
55 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.46it/s]
|
56 |
+
2025-07-31 06:39:13,227 - retrieval - INFO - Search results: Emergency=5, Treatment=5
|
57 |
+
2025-07-31 06:39:13,228 - retrieval - INFO - Deduplication: Processing 10 results using text matching
|
58 |
+
2025-07-31 06:39:13,228 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
|
59 |
+
Search query: 'MI|chest pain|cardiac arrest aspirin|nitroglycerin|thrombolytic|PCI'
|
60 |
+
Total results: 9
|
61 |
+
Emergency results: 4
|
62 |
+
Treatment results: 5
|
63 |
+
Duration: 0.778s
|
64 |
+
|
65 |
+
Top 3 results:
|
66 |
+
1. Type: treatment, Distance: 0.6740
|
67 |
+
Text preview: ong term management abbreviations : ace : angiotensin converting enzyme ; arb : angiotensin receptor...
|
68 |
+
2. Type: treatment, Distance: 0.6792
|
69 |
+
Text preview: on ; pci : percutaneous coronary intervention ; po : per os ; stemi : st elevation myocardial infarc...
|
70 |
+
3. Type: treatment, Distance: 0.6904
|
71 |
+
Text preview: receptor blocker ; mi : myocardial infarction # do ' s - a pre - hospital ecg is recommended. if ste...
|
72 |
+
|
73 |
+
✅ Test test_001 completed successfully (42.511s)
|
74 |
+
|
75 |
+
🔍 test_002: Symptoms-based query requiring LLM analysis
|
76 |
+
Query: 'patient with severe chest pain and shortness of breath'
|
77 |
+
------------------------------------------------------------
|
78 |
+
Step 1: Extracting medical condition and keywords...
|
79 |
+
2025-07-31 06:39:13,228 - llm_clients - INFO - Calling Medical LLM with query: patient with severe chest pain and shortness of breath
|
80 |
+
2025-07-31 06:39:31,525 - llm_clients - INFO - Raw LLM Response: Acute Coronary Syndrome (specifically, possible ST-Elevation Myocardial Infarction - STEMI, given severe chest pain, or non-STEMI/NST-Elevation Acute Coronary Syndrome if ST segments not elevated, based on ECG; shortness of breath indicates potential cardiac ischemia complication or concurrent pulmonary issue like cardiogenic pulmonary edema)
|
81 |
+
|
82 |
+
Note: This response is for informational purposes only and should not replace immediate medical evaluation and diagnosis by a licensed physician. The patient needs
|
83 |
+
2025-07-31 06:39:31,525 - llm_clients - INFO - Query Latency: 18.2971 seconds
|
84 |
+
2025-07-31 06:39:31,525 - llm_clients - INFO - Extracted Condition: Acute Coronary Syndrome (specifically, possible ST-Elevation Myocardial Infarction - STEMI, given severe chest pain, or non-STEMI/NST-Elevation Acute Coronary Syndrome if ST segments not elevated, based on ECG; shortness of breath indicates potential cardiac ischemia complication or concurrent pulmonary issue like cardiogenic pulmonary edema)
|
85 |
+
2025-07-31 06:39:31,525 - user_prompt - INFO - Starting semantic search fallback for query: 'patient with severe chest pain and shortness of breath'
|
86 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 7.70it/s]
|
87 |
+
2025-07-31 06:39:32,392 - retrieval - INFO - Sliding window search: Found 5 results
|
88 |
+
2025-07-31 06:39:32,402 - user_prompt - INFO - Semantic search returned 5 results
|
89 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.86it/s]
|
90 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.53it/s]
|
91 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.22it/s]
|
92 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.51it/s]
|
93 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.23it/s]
|
94 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.05it/s]
|
95 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.09it/s]
|
96 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.88it/s]
|
97 |
+
2025-07-31 06:39:32,729 - user_prompt - INFO - Inferred condition: None
|
98 |
+
2025-07-31 06:39:32,729 - user_prompt - WARNING - Condition validation failed for: None
|
99 |
+
2025-07-31 06:39:32,729 - user_prompt - INFO - No suitable condition found in semantic search
|
100 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.77it/s]
|
101 |
+
2025-07-31 06:39:33,251 - retrieval - INFO - Sliding window search: Found 5 results
|
102 |
+
Condition: generic medical query
|
103 |
+
Emergency keywords: medical|emergency
|
104 |
+
Treatment keywords: treatment|management
|
105 |
+
Source: generic_search
|
106 |
+
Duration: 20.033s
|
107 |
+
|
108 |
+
Step 2: User confirmation process...
|
109 |
+
Confirmation type: confirmation_needed
|
110 |
+
|
111 |
+
Step 3: Executing retrieval...
|
112 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.28it/s]
|
113 |
+
2025-07-31 06:39:33,404 - retrieval - INFO - Search results: Emergency=5, Treatment=5
|
114 |
+
2025-07-31 06:39:33,404 - retrieval - INFO - Deduplication: Processing 10 results using text matching
|
115 |
+
2025-07-31 06:39:33,404 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
|
116 |
+
Search query: 'medical|emergency treatment|management'
|
117 |
+
Total results: 9
|
118 |
+
Emergency results: 5
|
119 |
+
Treatment results: 4
|
120 |
+
Duration: 0.143s
|
121 |
+
|
122 |
+
Top 3 results:
|
123 |
+
1. Type: treatment, Distance: 0.7708
|
124 |
+
Text preview: and nurse practitioners who may or may not be formally trained in emergency medicine. they offer pri...
|
125 |
+
2. Type: emergency, Distance: 0.8056
|
126 |
+
Text preview: organization of emergency medical assistance emergency medical assistance is the first aid that is g...
|
127 |
+
3. Type: emergency, Distance: 0.8321
|
128 |
+
Text preview: ion to the emergency room ; - urgent situation that requires advanced medical care before transporta...
|
129 |
+
|
130 |
+
✅ Test test_002 completed successfully (20.176s)
|
131 |
+
|
132 |
+
🔍 test_003: Neurological emergency query
|
133 |
+
Query: 'sudden neurological symptoms suggesting stroke'
|
134 |
+
------------------------------------------------------------
|
135 |
+
Step 1: Extracting medical condition and keywords...
|
136 |
+
2025-07-31 06:39:33,404 - llm_clients - INFO - Calling Medical LLM with query: sudden neurological symptoms suggesting stroke
|
137 |
+
2025-07-31 06:39:49,400 - llm_clients - INFO - Raw LLM Response: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke
|
138 |
+
|
139 |
+
(As a medical assistant, I'm limited to providing condition labels, not advice. In this case, the description given—sudden neurological symptoms suggestive of stroke—points to an acute ischemic stroke, also known as cerebrovascular accident (CVA). This diagnosis implies a blockage of blood flow to the brain, resulting in sudden neurological deficits.)
|
140 |
+
|
141 |
+
**Please consult a qualified healthcare professional for evaluation and management.
|
142 |
+
2025-07-31 06:39:49,403 - llm_clients - INFO - Query Latency: 15.9960 seconds
|
143 |
+
2025-07-31 06:39:49,404 - llm_clients - INFO - Extracted Condition: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke
|
144 |
+
2025-07-31 06:39:49,405 - user_prompt - INFO - Starting semantic search fallback for query: 'sudden neurological symptoms suggesting stroke'
|
145 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 8.53it/s]
|
146 |
+
2025-07-31 06:39:50,205 - retrieval - INFO - Sliding window search: Found 5 results
|
147 |
+
2025-07-31 06:39:50,214 - user_prompt - INFO - Semantic search returned 5 results
|
148 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.55it/s]
|
149 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.19it/s]
|
150 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.05it/s]
|
151 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.50it/s]
|
152 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.67it/s]
|
153 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.14it/s]
|
154 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.27it/s]
|
155 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.62it/s]
|
156 |
+
2025-07-31 06:39:50,417 - user_prompt - INFO - Inferred condition: None
|
157 |
+
2025-07-31 06:39:50,418 - user_prompt - WARNING - Condition validation failed for: None
|
158 |
+
2025-07-31 06:39:50,418 - user_prompt - INFO - No suitable condition found in semantic search
|
159 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.16it/s]
|
160 |
+
2025-07-31 06:39:50,938 - retrieval - INFO - Sliding window search: Found 5 results
|
161 |
+
Condition: generic medical query
|
162 |
+
Emergency keywords: medical|emergency
|
163 |
+
Treatment keywords: treatment|management
|
164 |
+
Source: generic_search
|
165 |
+
Duration: 17.544s
|
166 |
+
|
167 |
+
Step 2: User confirmation process...
|
168 |
+
Confirmation type: confirmation_needed
|
169 |
+
|
170 |
+
Step 3: Executing retrieval...
|
171 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.02it/s]
|
172 |
+
2025-07-31 06:39:50,972 - retrieval - INFO - Search results: Emergency=5, Treatment=5
|
173 |
+
2025-07-31 06:39:50,972 - retrieval - INFO - Deduplication: Processing 10 results using text matching
|
174 |
+
2025-07-31 06:39:50,972 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
|
175 |
+
Search query: 'medical|emergency treatment|management'
|
176 |
+
Total results: 9
|
177 |
+
Emergency results: 5
|
178 |
+
Treatment results: 4
|
179 |
+
Duration: 0.025s
|
180 |
+
|
181 |
+
Top 3 results:
|
182 |
+
1. Type: treatment, Distance: 0.7708
|
183 |
+
Text preview: and nurse practitioners who may or may not be formally trained in emergency medicine. they offer pri...
|
184 |
+
2. Type: emergency, Distance: 0.8056
|
185 |
+
Text preview: organization of emergency medical assistance emergency medical assistance is the first aid that is g...
|
186 |
+
3. Type: emergency, Distance: 0.8321
|
187 |
+
Text preview: ion to the emergency room ; - urgent situation that requires advanced medical care before transporta...
|
188 |
+
|
189 |
+
✅ Test test_003 completed successfully (17.569s)
|
190 |
+
|
191 |
+
🔍 test_004: Protocol-specific stroke query
|
192 |
+
Query: 'acute stroke management protocol'
|
193 |
+
------------------------------------------------------------
|
194 |
+
Step 1: Extracting medical condition and keywords...
|
195 |
+
2025-07-31 06:39:50,973 - user_prompt - INFO - Matched predefined condition: acute stroke
|
196 |
+
Condition: acute stroke
|
197 |
+
Emergency keywords: stroke|neurological deficit|sudden weakness
|
198 |
+
Treatment keywords: tPA|thrombolysis|stroke unit care
|
199 |
+
Source: predefined_mapping
|
200 |
+
Duration: 0.000s
|
201 |
+
|
202 |
+
Step 2: User confirmation process...
|
203 |
+
Confirmation type: confirmation_needed
|
204 |
+
|
205 |
+
Step 3: Executing retrieval...
|
206 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.92it/s]
|
207 |
+
2025-07-31 06:39:51,110 - retrieval - INFO - Search results: Emergency=5, Treatment=5
|
208 |
+
2025-07-31 06:39:51,110 - retrieval - INFO - Deduplication: Processing 10 results using text matching
|
209 |
+
2025-07-31 06:39:51,110 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
|
210 |
+
Search query: 'stroke|neurological deficit|sudden weakness tPA|thrombolysis|stroke unit care'
|
211 |
+
Total results: 9
|
212 |
+
Emergency results: 5
|
213 |
+
Treatment results: 4
|
214 |
+
Duration: 0.137s
|
215 |
+
|
216 |
+
Top 3 results:
|
217 |
+
1. Type: treatment, Distance: 0.7389
|
218 |
+
Text preview: hree hours of the onset of stroke. early treatment ( within 90 minutes ) may be more likely to resul...
|
219 |
+
2. Type: treatment, Distance: 0.7401
|
220 |
+
Text preview: hree hours of the onset of stroke. early treatment ( within 90 minutes ) may be more likely to resul...
|
221 |
+
3. Type: emergency, Distance: 0.7685
|
222 |
+
Text preview: mproved outcomes for a broad spectrum of carefully selected clients who can be treated within three ...
|
223 |
+
|
224 |
+
✅ Test test_004 completed successfully (0.137s)
|
225 |
+
|
226 |
+
🔍 test_005: General symptom requiring LLM analysis
|
227 |
+
Query: 'patient presenting with acute abdominal pain'
|
228 |
+
------------------------------------------------------------
|
229 |
+
Step 1: Extracting medical condition and keywords...
|
230 |
+
2025-07-31 06:39:51,110 - llm_clients - INFO - Calling Medical LLM with query: patient presenting with acute abdominal pain
|
231 |
+
2025-07-31 06:40:00,096 - llm_clients - INFO - Raw LLM Response: Acute Appendicitis
|
232 |
+
|
233 |
+
(As a medical assistant, I identify the most representative condition here as acute appendicitis, given the patient's symptom of acute abdominal pain, particularly if localized in the right lower quadrant and accompanied by other typical signs like nausea, vomiting, fever, or guarding. However, this is not a definitive diagnosis and should be confirmed by a physician through clinical evaluation, imaging, or surgical findings.)
|
234 |
+
2025-07-31 06:40:00,096 - llm_clients - INFO - Query Latency: 8.9862 seconds
|
235 |
+
2025-07-31 06:40:00,097 - llm_clients - INFO - Extracted Condition: Acute Appendicitis
|
236 |
+
2025-07-31 06:40:00,097 - user_prompt - INFO - Starting semantic search fallback for query: 'patient presenting with acute abdominal pain'
|
237 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.49it/s]
|
238 |
+
2025-07-31 06:40:00,664 - retrieval - INFO - Sliding window search: Found 5 results
|
239 |
+
2025-07-31 06:40:00,673 - user_prompt - INFO - Semantic search returned 5 results
|
240 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.57it/s]
|
241 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.55it/s]
|
242 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.08it/s]
|
243 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.74it/s]
|
244 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.91it/s]
|
245 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.25it/s]
|
246 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.38it/s]
|
247 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.09it/s]
|
248 |
+
2025-07-31 06:40:00,876 - user_prompt - INFO - Inferred condition: None
|
249 |
+
2025-07-31 06:40:00,876 - user_prompt - WARNING - Condition validation failed for: None
|
250 |
+
2025-07-31 06:40:00,876 - user_prompt - INFO - No suitable condition found in semantic search
|
251 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.32it/s]
|
252 |
+
2025-07-31 06:40:01,399 - retrieval - INFO - Sliding window search: Found 5 results
|
253 |
+
Condition: generic medical query
|
254 |
+
Emergency keywords: medical|emergency
|
255 |
+
Treatment keywords: treatment|management
|
256 |
+
Source: generic_search
|
257 |
+
Duration: 10.298s
|
258 |
+
|
259 |
+
Step 2: User confirmation process...
|
260 |
+
Confirmation type: confirmation_needed
|
261 |
+
|
262 |
+
Step 3: Executing retrieval...
|
263 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.41it/s]
|
264 |
+
2025-07-31 06:40:01,432 - retrieval - INFO - Search results: Emergency=5, Treatment=5
|
265 |
+
2025-07-31 06:40:01,432 - retrieval - INFO - Deduplication: Processing 10 results using text matching
|
266 |
+
2025-07-31 06:40:01,432 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
|
267 |
+
Search query: 'medical|emergency treatment|management'
|
268 |
+
Total results: 9
|
269 |
+
Emergency results: 5
|
270 |
+
Treatment results: 4
|
271 |
+
Duration: 0.025s
|
272 |
+
|
273 |
+
Top 3 results:
|
274 |
+
1. Type: treatment, Distance: 0.7708
|
275 |
+
Text preview: and nurse practitioners who may or may not be formally trained in emergency medicine. they offer pri...
|
276 |
+
2. Type: emergency, Distance: 0.8056
|
277 |
+
Text preview: organization of emergency medical assistance emergency medical assistance is the first aid that is g...
|
278 |
+
3. Type: emergency, Distance: 0.8321
|
279 |
+
Text preview: ion to the emergency room ; - urgent situation that requires advanced medical care before transporta...
|
280 |
+
|
281 |
+
✅ Test test_005 completed successfully (10.322s)
|
282 |
+
|
283 |
+
🔍 test_006: Specific condition with treatment focus
|
284 |
+
Query: 'pulmonary embolism treatment guidelines'
|
285 |
+
------------------------------------------------------------
|
286 |
+
Step 1: Extracting medical condition and keywords...
|
287 |
+
2025-07-31 06:40:01,432 - user_prompt - INFO - Matched predefined condition: pulmonary embolism
|
288 |
+
Condition: pulmonary embolism
|
289 |
+
Emergency keywords: chest pain|shortness of breath|sudden dyspnea
|
290 |
+
Treatment keywords: anticoagulation|heparin|embolectomy
|
291 |
+
Source: predefined_mapping
|
292 |
+
Duration: 0.000s
|
293 |
+
|
294 |
+
Step 2: User confirmation process...
|
295 |
+
Confirmation type: confirmation_needed
|
296 |
+
|
297 |
+
Step 3: Executing retrieval...
|
298 |
+
Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.27it/s]
|
299 |
+
2025-07-31 06:40:01,562 - retrieval - INFO - Search results: Emergency=5, Treatment=5
|
300 |
+
2025-07-31 06:40:01,562 - retrieval - INFO - Deduplication: Processing 10 results using text matching
|
301 |
+
2025-07-31 06:40:01,562 - retrieval - INFO - Deduplication summary: 10 → 8 results (removed 2)
|
302 |
+
Search query: 'chest pain|shortness of breath|sudden dyspnea anticoagulation|heparin|embolectomy'
|
303 |
+
Total results: 8
|
304 |
+
Emergency results: 5
|
305 |
+
Treatment results: 3
|
306 |
+
Duration: 0.130s
|
307 |
+
|
308 |
+
Top 3 results:
|
309 |
+
1. Type: emergency, Distance: 0.8949
|
310 |
+
Text preview: algesics ( e. g. morphine, pethidine ) facilities for defibrillation ( df ) aspirin / anticoagulant ...
|
311 |
+
2. Type: treatment, Distance: 0.9196
|
312 |
+
Text preview: y proximal deep vein thrombosis leading to acute pulmonary embolism # # common causes of peripheral ...
|
313 |
+
3. Type: emergency, Distance: 0.9216
|
314 |
+
Text preview: ed or discolored skin in the affected leg - visible surface veins dvt usually involves the deep vein...
|
315 |
+
|
316 |
+
✅ Test test_006 completed successfully (0.130s)
|
317 |
+
|
318 |
+
================================================================================
|
319 |
+
📊 COMPREHENSIVE TEST REPORT
|
320 |
+
================================================================================
|
321 |
+
🕐 Execution Summary:
|
322 |
+
Start time: 2025-07-31 06:38:22
|
323 |
+
End time: 2025-07-31 06:40:01
|
324 |
+
Total duration: 98.954s
|
325 |
+
Average per test: 16.492s
|
326 |
+
|
327 |
+
📈 Test Results:
|
328 |
+
Total tests: 6
|
329 |
+
Successful: 6 ✅
|
330 |
+
Failed: 0 ❌
|
331 |
+
Success rate: 100.0%
|
332 |
+
|
333 |
+
✅ Successful Tests Analysis:
|
334 |
+
Condition extraction sources:
|
335 |
+
- predefined_mapping: 3 tests
|
336 |
+
- generic_search: 3 tests
|
337 |
+
Performance metrics:
|
338 |
+
- Avg condition extraction: 14.935s
|
339 |
+
- Avg retrieval time: 0.206s
|
340 |
+
|
341 |
+
📋 test_001: Classic acute myocardial infarction query
|
342 |
+
Query: 'how to treat acute MI?'
|
343 |
+
Condition: acute myocardial infarction
|
344 |
+
Source: predefined_mapping
|
345 |
+
Results: 9 total (4 emergency, 5 treatment)
|
346 |
+
Duration: 42.511s
|
347 |
+
|
348 |
+
📋 test_002: Symptoms-based query requiring LLM analysis
|
349 |
+
Query: 'patient with severe chest pain and shortness of breath'
|
350 |
+
Condition: generic medical query
|
351 |
+
Source: generic_search
|
352 |
+
Results: 9 total (5 emergency, 4 treatment)
|
353 |
+
Duration: 20.176s
|
354 |
+
|
355 |
+
📋 test_003: Neurological emergency query
|
356 |
+
Query: 'sudden neurological symptoms suggesting stroke'
|
357 |
+
Condition: generic medical query
|
358 |
+
Source: generic_search
|
359 |
+
Results: 9 total (5 emergency, 4 treatment)
|
360 |
+
Duration: 17.569s
|
361 |
+
|
362 |
+
📋 test_004: Protocol-specific stroke query
|
363 |
+
Query: 'acute stroke management protocol'
|
364 |
+
Condition: acute stroke
|
365 |
+
Source: predefined_mapping
|
366 |
+
Results: 9 total (5 emergency, 4 treatment)
|
367 |
+
Duration: 0.137s
|
368 |
+
|
369 |
+
📋 test_005: General symptom requiring LLM analysis
|
370 |
+
Query: 'patient presenting with acute abdominal pain'
|
371 |
+
Condition: generic medical query
|
372 |
+
Source: generic_search
|
373 |
+
Results: 9 total (5 emergency, 4 treatment)
|
374 |
+
Duration: 10.322s
|
375 |
+
|
376 |
+
📋 test_006: Specific condition with treatment focus
|
377 |
+
Query: 'pulmonary embolism treatment guidelines'
|
378 |
+
Condition: pulmonary embolism
|
379 |
+
Source: predefined_mapping
|
380 |
+
Results: 8 total (5 emergency, 3 treatment)
|
381 |
+
Duration: 0.130s
|
tests/test_chunk_quality_analysis.py
ADDED
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Chunk Quality Analysis Tests
|
3 |
+
|
4 |
+
This module analyzes chunk quality and identifies issues with chunk length differences
|
5 |
+
between emergency and treatment data processing methods.
|
6 |
+
|
7 |
+
Author: OnCall.ai Team
|
8 |
+
Date: 2025-07-28
|
9 |
+
"""
|
10 |
+
|
11 |
+
import sys
|
12 |
+
import json
|
13 |
+
import numpy as np
|
14 |
+
from pathlib import Path
|
15 |
+
from typing import List, Dict, Tuple
|
16 |
+
import logging
|
17 |
+
|
18 |
+
# Setup logging
|
19 |
+
logging.basicConfig(
|
20 |
+
level=logging.INFO,
|
21 |
+
format='%(levelname)s:%(name)s:%(message)s'
|
22 |
+
)
|
23 |
+
logger = logging.getLogger(__name__)
|
24 |
+
|
25 |
+
# Add src to python path
|
26 |
+
current_dir = Path(__file__).parent.resolve()
|
27 |
+
project_root = current_dir.parent
|
28 |
+
sys.path.append(str(project_root / "src"))
|
29 |
+
|
30 |
+
from data_processing import DataProcessor #type: ignore
|
31 |
+
|
32 |
+
class TestChunkQualityAnalysis:
|
33 |
+
|
34 |
+
def setup_class(self):
|
35 |
+
"""Initialize test environment"""
|
36 |
+
print("\n=== Phase 1: Setting up Chunk Quality Analysis ===")
|
37 |
+
self.base_dir = Path(__file__).parent.parent.resolve()
|
38 |
+
self.models_dir = self.base_dir / "models"
|
39 |
+
self.embeddings_dir = self.models_dir / "embeddings"
|
40 |
+
|
41 |
+
print(f"• Base directory: {self.base_dir}")
|
42 |
+
print(f"• Models directory: {self.models_dir}")
|
43 |
+
|
44 |
+
# Initialize processor
|
45 |
+
self.processor = DataProcessor(base_dir=str(self.base_dir))
|
46 |
+
print("• DataProcessor initialized")
|
47 |
+
|
48 |
+
def test_chunk_length_analysis(self):
|
49 |
+
"""Detailed analysis of chunk length distribution"""
|
50 |
+
print("\n=== Phase 2: Chunk Length Distribution Analysis ===")
|
51 |
+
|
52 |
+
try:
|
53 |
+
# Load chunk data
|
54 |
+
print("• Loading chunk data...")
|
55 |
+
with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
|
56 |
+
emergency_chunks = json.load(f)
|
57 |
+
with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
|
58 |
+
treatment_chunks = json.load(f)
|
59 |
+
|
60 |
+
# Analyze emergency chunks
|
61 |
+
em_lengths = [len(chunk['text']) for chunk in emergency_chunks]
|
62 |
+
em_token_counts = [chunk.get('token_count', 0) for chunk in emergency_chunks]
|
63 |
+
|
64 |
+
print(f"\n📊 Emergency Chunks Analysis:")
|
65 |
+
print(f"• Total chunks: {len(em_lengths):,}")
|
66 |
+
print(f"• Min length: {min(em_lengths)} chars")
|
67 |
+
print(f"• Max length: {max(em_lengths)} chars")
|
68 |
+
print(f"• Average length: {sum(em_lengths)/len(em_lengths):.2f} chars")
|
69 |
+
print(f"• Median length: {sorted(em_lengths)[len(em_lengths)//2]} chars")
|
70 |
+
|
71 |
+
if any(em_token_counts):
|
72 |
+
avg_tokens = sum(em_token_counts)/len(em_token_counts)
|
73 |
+
print(f"• Average tokens: {avg_tokens:.2f}")
|
74 |
+
print(f"• Chars per token ratio: {(sum(em_lengths)/len(em_lengths)) / avg_tokens:.2f}")
|
75 |
+
|
76 |
+
# Analyze treatment chunks
|
77 |
+
tr_lengths = [len(chunk['text']) for chunk in treatment_chunks]
|
78 |
+
|
79 |
+
print(f"\n📊 Treatment Chunks Analysis:")
|
80 |
+
print(f"• Total chunks: {len(tr_lengths):,}")
|
81 |
+
print(f"• Min length: {min(tr_lengths)} chars")
|
82 |
+
print(f"• Max length: {max(tr_lengths)} chars")
|
83 |
+
print(f"• Average length: {sum(tr_lengths)/len(tr_lengths):.2f} chars")
|
84 |
+
print(f"• Median length: {sorted(tr_lengths)[len(tr_lengths)//2]} chars")
|
85 |
+
|
86 |
+
# Length distribution comparison
|
87 |
+
em_avg = sum(em_lengths)/len(em_lengths)
|
88 |
+
tr_avg = sum(tr_lengths)/len(tr_lengths)
|
89 |
+
ratio = em_avg / tr_avg
|
90 |
+
|
91 |
+
print(f"\n🔍 Length Distribution Comparison:")
|
92 |
+
print(f"• Emergency average: {em_avg:.0f} chars")
|
93 |
+
print(f"• Treatment average: {tr_avg:.0f} chars")
|
94 |
+
print(f"• Ratio (Emergency/Treatment): {ratio:.1f}x")
|
95 |
+
|
96 |
+
# Length distribution buckets
|
97 |
+
print(f"\n📈 Length Distribution Buckets:")
|
98 |
+
buckets = [0, 100, 250, 500, 1000, 2000, 5000]
|
99 |
+
|
100 |
+
for i in range(len(buckets)-1):
|
101 |
+
em_count = sum(1 for l in em_lengths if buckets[i] <= l < buckets[i+1])
|
102 |
+
tr_count = sum(1 for l in tr_lengths if buckets[i] <= l < buckets[i+1])
|
103 |
+
print(f"• {buckets[i]}-{buckets[i+1]} chars: Emergency={em_count}, Treatment={tr_count}")
|
104 |
+
|
105 |
+
# Flag potential issues
|
106 |
+
if ratio > 5.0:
|
107 |
+
print(f"\n⚠️ WARNING: Emergency chunks are {ratio:.1f}x longer than treatment chunks!")
|
108 |
+
print(" This suggests different chunking strategies are being used.")
|
109 |
+
|
110 |
+
print("✅ Chunk length analysis completed")
|
111 |
+
|
112 |
+
except Exception as e:
|
113 |
+
print(f"❌ Error in chunk length analysis: {str(e)}")
|
114 |
+
raise
|
115 |
+
|
116 |
+
def test_chunking_method_comparison(self):
|
117 |
+
"""Compare the two chunking methods on the same data"""
|
118 |
+
print("\n=== Phase 3: Chunking Method Comparison ===")
|
119 |
+
|
120 |
+
try:
|
121 |
+
# Load data
|
122 |
+
print("• Loading dataset for comparison...")
|
123 |
+
self.processor.load_filtered_data()
|
124 |
+
|
125 |
+
# Test on multiple samples for better analysis
|
126 |
+
sample_size = 5
|
127 |
+
samples = self.processor.treatment_data.head(sample_size)
|
128 |
+
|
129 |
+
method1_results = [] # keyword_centered_chunks
|
130 |
+
method2_results = [] # dual_keyword_chunks
|
131 |
+
|
132 |
+
print(f"• Testing {sample_size} samples with both methods...")
|
133 |
+
|
134 |
+
for idx, row in samples.iterrows():
|
135 |
+
if not row.get('clean_text') or not row.get('treatment_matched'):
|
136 |
+
continue
|
137 |
+
|
138 |
+
text_length = len(row['clean_text'])
|
139 |
+
emergency_kw = row.get('matched', '')
|
140 |
+
treatment_kw = row['treatment_matched']
|
141 |
+
|
142 |
+
# Method 1: keyword_centered_chunks (Emergency method)
|
143 |
+
chunks1 = self.processor.create_keyword_centered_chunks(
|
144 |
+
text=row['clean_text'],
|
145 |
+
matched_keywords=emergency_kw,
|
146 |
+
chunk_size=256,
|
147 |
+
doc_id=f"test_{idx}"
|
148 |
+
)
|
149 |
+
|
150 |
+
# Method 2: dual_keyword_chunks (Treatment method)
|
151 |
+
chunks2 = self.processor.create_dual_keyword_chunks(
|
152 |
+
text=row['clean_text'],
|
153 |
+
emergency_keywords=emergency_kw,
|
154 |
+
treatment_keywords=treatment_kw,
|
155 |
+
chunk_size=256,
|
156 |
+
doc_id=f"test_{idx}"
|
157 |
+
)
|
158 |
+
|
159 |
+
# Collect results
|
160 |
+
if chunks1:
|
161 |
+
avg_len1 = sum(len(c['text']) for c in chunks1) / len(chunks1)
|
162 |
+
method1_results.append({
|
163 |
+
'doc_id': idx,
|
164 |
+
'chunks_count': len(chunks1),
|
165 |
+
'avg_length': avg_len1,
|
166 |
+
'text_length': text_length
|
167 |
+
})
|
168 |
+
|
169 |
+
if chunks2:
|
170 |
+
avg_len2 = sum(len(c['text']) for c in chunks2) / len(chunks2)
|
171 |
+
method2_results.append({
|
172 |
+
'doc_id': idx,
|
173 |
+
'chunks_count': len(chunks2),
|
174 |
+
'avg_length': avg_len2,
|
175 |
+
'text_length': text_length
|
176 |
+
})
|
177 |
+
|
178 |
+
# Analysis results
|
179 |
+
print(f"\n📊 Method Comparison Results:")
|
180 |
+
|
181 |
+
if method1_results:
|
182 |
+
avg_chunks1 = sum(r['chunks_count'] for r in method1_results) / len(method1_results)
|
183 |
+
avg_len1 = sum(r['avg_length'] for r in method1_results) / len(method1_results)
|
184 |
+
print(f"\n🔹 Keyword-Centered Method (Emergency):")
|
185 |
+
print(f"• Average chunks per document: {avg_chunks1:.1f}")
|
186 |
+
print(f"• Average chunk length: {avg_len1:.0f} chars")
|
187 |
+
|
188 |
+
if method2_results:
|
189 |
+
avg_chunks2 = sum(r['chunks_count'] for r in method2_results) / len(method2_results)
|
190 |
+
avg_len2 = sum(r['avg_length'] for r in method2_results) / len(method2_results)
|
191 |
+
print(f"\n🔹 Dual-Keyword Method (Treatment):")
|
192 |
+
print(f"• Average chunks per document: {avg_chunks2:.1f}")
|
193 |
+
print(f"• Average chunk length: {avg_len2:.0f} chars")
|
194 |
+
|
195 |
+
if method1_results:
|
196 |
+
ratio = avg_len1 / avg_len2
|
197 |
+
print(f"\n🔍 Length Ratio: {ratio:.1f}x (Method1 / Method2)")
|
198 |
+
|
199 |
+
print("✅ Chunking method comparison completed")
|
200 |
+
|
201 |
+
except Exception as e:
|
202 |
+
print(f"❌ Error in method comparison: {str(e)}")
|
203 |
+
raise
|
204 |
+
|
205 |
+
def test_token_vs_character_analysis(self):
|
206 |
+
"""Analyze token vs character differences in chunking"""
|
207 |
+
print("\n=== Phase 4: Token vs Character Analysis ===")
|
208 |
+
|
209 |
+
try:
|
210 |
+
# Load model for tokenization
|
211 |
+
print("• Loading embedding model for tokenization...")
|
212 |
+
self.processor.load_embedding_model()
|
213 |
+
|
214 |
+
# Test sample texts
|
215 |
+
test_texts = [
|
216 |
+
"Patient presents with acute chest pain and shortness of breath.",
|
217 |
+
"Emergency treatment for myocardial infarction includes immediate medication.",
|
218 |
+
"The patient's vital signs show tachycardia and hypotension requiring intervention."
|
219 |
+
]
|
220 |
+
|
221 |
+
print(f"\n📊 Token vs Character Analysis:")
|
222 |
+
|
223 |
+
total_chars = 0
|
224 |
+
total_tokens = 0
|
225 |
+
|
226 |
+
for i, text in enumerate(test_texts, 1):
|
227 |
+
char_count = len(text)
|
228 |
+
token_count = len(self.processor.tokenizer.tokenize(text))
|
229 |
+
ratio = char_count / token_count if token_count > 0 else 0
|
230 |
+
|
231 |
+
print(f"\nSample {i}:")
|
232 |
+
print(f"• Text: {text[:50]}...")
|
233 |
+
print(f"• Characters: {char_count}")
|
234 |
+
print(f"• Tokens: {token_count}")
|
235 |
+
print(f"• Chars/Token ratio: {ratio:.2f}")
|
236 |
+
|
237 |
+
total_chars += char_count
|
238 |
+
total_tokens += token_count
|
239 |
+
|
240 |
+
overall_ratio = total_chars / total_tokens
|
241 |
+
print(f"\n🔍 Overall Character/Token Ratio: {overall_ratio:.2f}")
|
242 |
+
|
243 |
+
# Estimate chunk sizes
|
244 |
+
target_tokens = 256
|
245 |
+
estimated_chars = target_tokens * overall_ratio
|
246 |
+
|
247 |
+
print(f"\n📏 Chunk Size Estimates:")
|
248 |
+
print(f"• Target tokens: {target_tokens}")
|
249 |
+
print(f"• Estimated characters: {estimated_chars:.0f}")
|
250 |
+
print(f"• Current emergency avg: 1842 chars ({1842/overall_ratio:.0f} estimated tokens)")
|
251 |
+
print(f"• Current treatment avg: 250 chars ({250/overall_ratio:.0f} estimated tokens)")
|
252 |
+
|
253 |
+
# Recommendations
|
254 |
+
print(f"\n💡 Recommendations:")
|
255 |
+
if 1842/overall_ratio > 512:
|
256 |
+
print("⚠️ Emergency chunks may exceed model's 512 token limit!")
|
257 |
+
if 250/overall_ratio < 64:
|
258 |
+
print("⚠️ Treatment chunks may be too short for meaningful context!")
|
259 |
+
|
260 |
+
print("✅ Token vs character analysis completed")
|
261 |
+
|
262 |
+
except Exception as e:
|
263 |
+
print(f"❌ Error in token analysis: {str(e)}")
|
264 |
+
raise
|
265 |
+
|
266 |
+
def test_generate_recommendations(self):
|
267 |
+
"""Generate recommendations based on analysis"""
|
268 |
+
print("\n=== Phase 5: Generating Recommendations ===")
|
269 |
+
|
270 |
+
recommendations = []
|
271 |
+
|
272 |
+
# Based on the known chunk length difference
|
273 |
+
recommendations.append({
|
274 |
+
'issue': 'Inconsistent chunk lengths',
|
275 |
+
'description': 'Emergency chunks (1842 chars) are 7x longer than treatment chunks (250 chars)',
|
276 |
+
'recommendation': 'Standardize both methods to use token-based chunking with consistent parameters',
|
277 |
+
'priority': 'HIGH'
|
278 |
+
})
|
279 |
+
|
280 |
+
recommendations.append({
|
281 |
+
'issue': 'Different chunking strategies',
|
282 |
+
'description': 'Emergency uses keyword-centered (token-based), Treatment uses dual-keyword (character-based)',
|
283 |
+
'recommendation': 'Update dual_keyword_chunks to use tokenizer for consistent token-based chunking',
|
284 |
+
'priority': 'HIGH'
|
285 |
+
})
|
286 |
+
|
287 |
+
recommendations.append({
|
288 |
+
'issue': 'Potential token limit overflow',
|
289 |
+
'description': 'Large chunks may exceed PubMedBERT 512 token limit',
|
290 |
+
'recommendation': 'Implement strict token-based chunking with overlap to prevent overflow',
|
291 |
+
'priority': 'MEDIUM'
|
292 |
+
})
|
293 |
+
|
294 |
+
print(f"\n📋 Analysis Recommendations:")
|
295 |
+
for i, rec in enumerate(recommendations, 1):
|
296 |
+
print(f"\n{i}. {rec['issue']} [{rec['priority']}]")
|
297 |
+
print(f" Problem: {rec['description']}")
|
298 |
+
print(f" Solution: {rec['recommendation']}")
|
299 |
+
|
300 |
+
print("\n✅ Recommendations generated")
|
301 |
+
return recommendations
|
302 |
+
|
303 |
+
def main():
|
304 |
+
"""Run all chunk quality analysis tests"""
|
305 |
+
print("\n" + "="*60)
|
306 |
+
print("CHUNK QUALITY ANALYSIS TEST SUITE")
|
307 |
+
print("="*60)
|
308 |
+
|
309 |
+
test = TestChunkQualityAnalysis()
|
310 |
+
test.setup_class()
|
311 |
+
|
312 |
+
try:
|
313 |
+
test.test_chunk_length_analysis()
|
314 |
+
test.test_chunking_method_comparison()
|
315 |
+
test.test_token_vs_character_analysis()
|
316 |
+
recommendations = test.test_generate_recommendations()
|
317 |
+
|
318 |
+
print("\n" + "="*60)
|
319 |
+
print("🎉 ALL CHUNK QUALITY TESTS COMPLETED SUCCESSFULLY!")
|
320 |
+
print("="*60)
|
321 |
+
print(f"\nKey Finding: Chunk length inconsistency detected!")
|
322 |
+
print(f"Emergency: ~1842 chars, Treatment: ~250 chars (7x difference)")
|
323 |
+
print(f"Recommendation: Standardize to token-based chunking")
|
324 |
+
print("="*60)
|
325 |
+
|
326 |
+
except Exception as e:
|
327 |
+
print("\n" + "="*60)
|
328 |
+
print("❌ CHUNK QUALITY TESTS FAILED!")
|
329 |
+
print(f"Error: {str(e)}")
|
330 |
+
print("="*60)
|
331 |
+
|
332 |
+
if __name__ == "__main__":
|
333 |
+
main()
|
tests/test_data_processing.py
CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
|
|
12 |
# Add src to path
|
13 |
sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
|
14 |
|
15 |
-
from data_processing import DataProcessor
|
16 |
import logging
|
17 |
|
18 |
# Setup logging
|
@@ -80,7 +80,7 @@ def test_chunking():
|
|
80 |
chunks = processor.create_keyword_centered_chunks(
|
81 |
text=row['clean_text'],
|
82 |
matched_keywords=row['matched'],
|
83 |
-
chunk_size=
|
84 |
doc_id=str(row.get('id', idx))
|
85 |
)
|
86 |
emergency_chunks.extend(chunks)
|
@@ -97,7 +97,7 @@ def test_chunking():
|
|
97 |
text=row['clean_text'],
|
98 |
emergency_keywords=row.get('matched', ''),
|
99 |
treatment_keywords=row['treatment_matched'],
|
100 |
-
chunk_size=
|
101 |
doc_id=str(row.get('id', idx))
|
102 |
)
|
103 |
treatment_chunks.extend(chunks)
|
@@ -116,7 +116,7 @@ def test_chunking():
|
|
116 |
sample_chunk = treatment_chunks[0]
|
117 |
print(f"\nSample treatment chunk:")
|
118 |
print(f" Primary keyword: {sample_chunk['primary_keyword']}")
|
119 |
-
print(f" Emergency keywords: {sample_chunk
|
120 |
print(f" Text length: {len(sample_chunk['text'])}")
|
121 |
print(f" Text preview: {sample_chunk['text'][:100]}...")
|
122 |
|
@@ -186,18 +186,109 @@ def test_token_chunking():
|
|
186 |
print(f"❌ Token chunking test failed: {e}")
|
187 |
return False
|
188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
def main():
|
190 |
"""Run all tests"""
|
191 |
print("Starting data processing tests...\n")
|
192 |
|
193 |
-
# Import pandas here since it's used in chunking test
|
194 |
-
import pandas as pd
|
195 |
-
|
196 |
tests = [
|
197 |
test_data_loading,
|
198 |
test_chunking,
|
199 |
test_model_loading,
|
200 |
-
test_token_chunking
|
|
|
201 |
]
|
202 |
|
203 |
results = []
|
|
|
12 |
# Add src to path
|
13 |
sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
|
14 |
|
15 |
+
from data_processing import DataProcessor #type: ignore
|
16 |
import logging
|
17 |
|
18 |
# Setup logging
|
|
|
80 |
chunks = processor.create_keyword_centered_chunks(
|
81 |
text=row['clean_text'],
|
82 |
matched_keywords=row['matched'],
|
83 |
+
chunk_size=256, # Updated to use 256 tokens
|
84 |
doc_id=str(row.get('id', idx))
|
85 |
)
|
86 |
emergency_chunks.extend(chunks)
|
|
|
97 |
text=row['clean_text'],
|
98 |
emergency_keywords=row.get('matched', ''),
|
99 |
treatment_keywords=row['treatment_matched'],
|
100 |
+
chunk_size=256, # Updated to use 256 tokens
|
101 |
doc_id=str(row.get('id', idx))
|
102 |
)
|
103 |
treatment_chunks.extend(chunks)
|
|
|
116 |
sample_chunk = treatment_chunks[0]
|
117 |
print(f"\nSample treatment chunk:")
|
118 |
print(f" Primary keyword: {sample_chunk['primary_keyword']}")
|
119 |
+
print(f" Emergency keywords: {sample_chunk.get('emergency_keywords', '')}")
|
120 |
print(f" Text length: {len(sample_chunk['text'])}")
|
121 |
print(f" Text preview: {sample_chunk['text'][:100]}...")
|
122 |
|
|
|
186 |
print(f"❌ Token chunking test failed: {e}")
|
187 |
return False
|
188 |
|
189 |
+
def test_dual_keyword_chunks():
|
190 |
+
"""Test the enhanced dual keyword chunking functionality with token-based approach"""
|
191 |
+
print("\n" + "="*50)
|
192 |
+
print("TESTING DUAL KEYWORD CHUNKING")
|
193 |
+
print("="*50)
|
194 |
+
|
195 |
+
try:
|
196 |
+
processor = DataProcessor()
|
197 |
+
processor.load_embedding_model() # Need tokenizer for token count verification
|
198 |
+
|
199 |
+
# Test case 1: Both emergency and treatment keywords
|
200 |
+
print("\nTest Case 1: Both Keywords")
|
201 |
+
text = "Patient with acute MI requires immediate IV treatment. Additional chest pain symptoms require aspirin administration."
|
202 |
+
emergency_kws = "MI|chest pain"
|
203 |
+
treatment_kws = "IV|aspirin"
|
204 |
+
|
205 |
+
chunks = processor.create_dual_keyword_chunks(
|
206 |
+
text=text,
|
207 |
+
emergency_keywords=emergency_kws,
|
208 |
+
treatment_keywords=treatment_kws,
|
209 |
+
chunk_size=256
|
210 |
+
)
|
211 |
+
|
212 |
+
# Verify chunk properties
|
213 |
+
for i, chunk in enumerate(chunks):
|
214 |
+
print(f"\nChunk {i+1}:")
|
215 |
+
# Verify source type
|
216 |
+
source_type = chunk.get('source_type')
|
217 |
+
assert source_type in ['emergency', 'treatment'], f"Invalid source_type: {source_type}"
|
218 |
+
print(f"• Source type: {source_type}")
|
219 |
+
|
220 |
+
# Verify metadata for treatment chunks
|
221 |
+
if source_type == 'treatment':
|
222 |
+
contains_em = chunk.get('contains_emergency_kws', [])
|
223 |
+
contains_tr = chunk.get('contains_treatment_kws', [])
|
224 |
+
match_type = chunk.get('match_type')
|
225 |
+
print(f"• Contains Emergency: {contains_em}")
|
226 |
+
print(f"• Contains Treatment: {contains_tr}")
|
227 |
+
print(f"• Match Type: {match_type}")
|
228 |
+
assert match_type in ['both', 'emergency_only', 'treatment_only', 'none'], \
|
229 |
+
f"Invalid match_type: {match_type}"
|
230 |
+
|
231 |
+
# Verify token count
|
232 |
+
tokens = processor.tokenizer.tokenize(chunk['text'])
|
233 |
+
token_count = len(tokens)
|
234 |
+
print(f"• Token count: {token_count}")
|
235 |
+
# Allow for overlap
|
236 |
+
assert token_count <= 384, f"Chunk too large: {token_count} tokens"
|
237 |
+
|
238 |
+
# Print text preview
|
239 |
+
print(f"• Text preview: {chunk['text'][:100]}...")
|
240 |
+
|
241 |
+
# Test case 2: Emergency keywords only
|
242 |
+
print("\nTest Case 2: Emergency Only")
|
243 |
+
text = "Patient presents with severe chest pain and dyspnea."
|
244 |
+
emergency_kws = "chest pain"
|
245 |
+
treatment_kws = ""
|
246 |
+
|
247 |
+
chunks = processor.create_dual_keyword_chunks(
|
248 |
+
text=text,
|
249 |
+
emergency_keywords=emergency_kws,
|
250 |
+
treatment_keywords=treatment_kws,
|
251 |
+
chunk_size=256
|
252 |
+
)
|
253 |
+
|
254 |
+
assert len(chunks) > 0, "No chunks generated for emergency-only case"
|
255 |
+
print(f"✓ Generated {len(chunks)} chunks")
|
256 |
+
|
257 |
+
# Test case 3: Treatment keywords only
|
258 |
+
print("\nTest Case 3: Treatment Only")
|
259 |
+
text = "Administer IV fluids and monitor response."
|
260 |
+
emergency_kws = ""
|
261 |
+
treatment_kws = "IV"
|
262 |
+
|
263 |
+
chunks = processor.create_dual_keyword_chunks(
|
264 |
+
text=text,
|
265 |
+
emergency_keywords=emergency_kws,
|
266 |
+
treatment_keywords=treatment_kws,
|
267 |
+
chunk_size=256
|
268 |
+
)
|
269 |
+
|
270 |
+
assert len(chunks) > 0, "No chunks generated for treatment-only case"
|
271 |
+
print(f"✓ Generated {len(chunks)} chunks")
|
272 |
+
|
273 |
+
print("\n✅ All dual keyword chunking tests passed")
|
274 |
+
return True
|
275 |
+
|
276 |
+
except Exception as e:
|
277 |
+
print(f"\n❌ Dual keyword chunking test failed: {e}")
|
278 |
+
import traceback
|
279 |
+
traceback.print_exc()
|
280 |
+
return False
|
281 |
+
|
282 |
def main():
|
283 |
"""Run all tests"""
|
284 |
print("Starting data processing tests...\n")
|
285 |
|
|
|
|
|
|
|
286 |
tests = [
|
287 |
test_data_loading,
|
288 |
test_chunking,
|
289 |
test_model_loading,
|
290 |
+
test_token_chunking,
|
291 |
+
test_dual_keyword_chunks # Added new test
|
292 |
]
|
293 |
|
294 |
results = []
|
tests/test_embedding_and_index.py
CHANGED
@@ -1,29 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import numpy as np
|
2 |
from annoy import AnnoyIndex
|
3 |
import pytest
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
def
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Basic embedding and index validation tests
|
3 |
+
"""
|
4 |
+
# 2025-07-28
|
5 |
+
import sys
|
6 |
+
from pathlib import Path
|
7 |
+
#
|
8 |
+
|
9 |
import numpy as np
|
10 |
from annoy import AnnoyIndex
|
11 |
import pytest
|
12 |
+
|
13 |
+
print("\n=== Phase 1: Initializing Test Environment ===")
|
14 |
+
# add src to python path
|
15 |
+
current_dir = Path(__file__).parent.resolve()
|
16 |
+
project_root = current_dir.parent
|
17 |
+
sys.path.append(str(project_root / "src"))
|
18 |
+
|
19 |
+
print(f"• Current directory: {current_dir}")
|
20 |
+
print(f"• Project root: {project_root}")
|
21 |
+
print(f"• Python path: {sys.path}")
|
22 |
+
|
23 |
+
from data_processing import DataProcessor #type: ignore
|
24 |
+
|
25 |
+
|
26 |
+
class TestEmbeddingAndIndex:
|
27 |
+
def setup_class(self):
|
28 |
+
"""初始化測試類"""
|
29 |
+
print("\n=== Phase 2: Setting up TestEmbeddingAndIndex ===")
|
30 |
+
self.base_dir = Path(__file__).parent.parent.resolve()
|
31 |
+
print(f"• Base directory: {self.base_dir}")
|
32 |
+
self.processor = DataProcessor(base_dir=str(self.base_dir))
|
33 |
+
print("• DataProcessor initialized")
|
34 |
+
|
35 |
+
def test_embedding_dimensions(self):
|
36 |
+
print("\n=== Phase 3: Testing Embedding Dimensions ===")
|
37 |
+
print("• Loading emergency embeddings...")
|
38 |
+
# load emergency embeddings
|
39 |
+
emb = np.load(self.processor.models_dir / "embeddings" / "emergency_embeddings.npy")
|
40 |
+
expected_dim = self.processor.embedding_dim
|
41 |
+
|
42 |
+
print(f"• Loaded embedding shape: {emb.shape}")
|
43 |
+
print(f"• Expected dimension: {expected_dim}")
|
44 |
+
|
45 |
+
assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
|
46 |
+
assert emb.shape[1] == expected_dim, (
|
47 |
+
f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
|
48 |
+
)
|
49 |
+
print("✅ Embedding dimensions test passed")
|
50 |
+
|
51 |
+
def test_annoy_search(self):
|
52 |
+
print("\n=== Phase 4: Testing Annoy Search ===")
|
53 |
+
print("• Loading embeddings...")
|
54 |
+
# load embeddings
|
55 |
+
emb = np.load(self.processor.models_dir / "embeddings" / "emergency_embeddings.npy")
|
56 |
+
print(f"• Loaded embeddings shape: {emb.shape}")
|
57 |
+
|
58 |
+
print("• Loading Annoy index...")
|
59 |
+
# load Annoy index
|
60 |
+
idx = AnnoyIndex(self.processor.embedding_dim, 'angular')
|
61 |
+
index_path = self.processor.models_dir / "indices" / "annoy" / "emergency_index.ann"
|
62 |
+
print(f"• Index path: {index_path}")
|
63 |
+
idx.load(str(index_path))
|
64 |
+
|
65 |
+
print("• Performing sample query...")
|
66 |
+
# perform a sample query
|
67 |
+
query_vec = emb[0]
|
68 |
+
ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
|
69 |
+
|
70 |
+
print(f"• Search results:")
|
71 |
+
print(f" - Found IDs: {ids}")
|
72 |
+
print(f" - Distances: {[f'{d:.4f}' for d in distances]}")
|
73 |
+
|
74 |
+
assert len(ids) == 5, f"Expected 5 results, got {len(ids)}"
|
75 |
+
assert all(0 <= d <= 2 for d in distances), "Invalid distance values"
|
76 |
+
print("✅ Annoy search test passed")
|
77 |
+
|
78 |
+
def main():
|
79 |
+
"""Run tests manually"""
|
80 |
+
print("\n" + "="*50)
|
81 |
+
print("Starting Embedding and Index Tests")
|
82 |
+
print("="*50)
|
83 |
+
|
84 |
+
test = TestEmbeddingAndIndex()
|
85 |
+
test.setup_class() # 手動初始化
|
86 |
+
|
87 |
+
try:
|
88 |
+
test.test_embedding_dimensions()
|
89 |
+
test.test_annoy_search()
|
90 |
+
print("\n" + "="*50)
|
91 |
+
print("🎉 All tests completed successfully!")
|
92 |
+
print("="*50)
|
93 |
+
|
94 |
+
except Exception as e:
|
95 |
+
print("\n" + "="*50)
|
96 |
+
print("❌ Tests failed!")
|
97 |
+
print(f"Error: {str(e)}")
|
98 |
+
print("="*50)
|
99 |
+
|
100 |
+
if __name__ == "__main__":
|
101 |
+
main()
|
tests/test_embedding_validation.py
CHANGED
@@ -7,14 +7,27 @@ import numpy as np
|
|
7 |
import json
|
8 |
import logging
|
9 |
import os
|
|
|
10 |
from pathlib import Path
|
11 |
from typing import Tuple, List, Optional
|
12 |
from annoy import AnnoyIndex
|
13 |
from sentence_transformers import SentenceTransformer
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
class TestEmbeddingValidation:
|
16 |
def setup_class(self):
|
17 |
"""Initialize test environment with necessary data and models."""
|
|
|
|
|
18 |
# Setup logging
|
19 |
logging.basicConfig(
|
20 |
level=logging.DEBUG,
|
@@ -24,43 +37,57 @@ class TestEmbeddingValidation:
|
|
24 |
self.logger = logging.getLogger(__name__)
|
25 |
|
26 |
# Define base paths
|
27 |
-
self.project_root = Path(
|
28 |
self.models_dir = self.project_root / "models"
|
29 |
self.embeddings_dir = self.models_dir / "embeddings"
|
30 |
self.indices_dir = self.models_dir / "indices" / "annoy"
|
31 |
|
|
|
|
|
|
|
|
|
32 |
self.logger.info(f"Project root: {self.project_root}")
|
33 |
self.logger.info(f"Models directory: {self.models_dir}")
|
34 |
self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
|
35 |
|
36 |
try:
|
37 |
# Check directory existence
|
|
|
38 |
if not self.embeddings_dir.exists():
|
39 |
raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
|
40 |
if not self.indices_dir.exists():
|
41 |
raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
|
42 |
|
43 |
# Load embeddings
|
|
|
44 |
self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
|
45 |
self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
|
46 |
|
47 |
# Load chunks
|
|
|
48 |
with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
|
49 |
self.emergency_chunks = json.load(f)
|
50 |
with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
|
51 |
self.treatment_chunks = json.load(f)
|
52 |
|
53 |
# Initialize model
|
|
|
54 |
self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
|
55 |
|
|
|
|
|
|
|
|
|
56 |
self.logger.info("Test environment initialized successfully")
|
57 |
self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
|
58 |
self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
|
59 |
|
60 |
except FileNotFoundError as e:
|
|
|
61 |
self.logger.error(f"File not found: {e}")
|
62 |
raise
|
63 |
except Exception as e:
|
|
|
64 |
self.logger.error(f"Error during initialization: {e}")
|
65 |
raise
|
66 |
|
@@ -84,20 +111,28 @@ class TestEmbeddingValidation:
|
|
84 |
|
85 |
def test_embedding_dimensions(self):
|
86 |
"""Test embedding dimensions and data quality."""
|
|
|
87 |
self.logger.info("\n=== Embedding Validation Report ===")
|
88 |
|
89 |
try:
|
90 |
# Basic dimension checks
|
|
|
91 |
assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
|
92 |
assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
|
|
|
|
|
93 |
|
94 |
# Count verification
|
|
|
95 |
assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
|
96 |
"Emergency chunks count mismatch"
|
97 |
assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
|
98 |
"Treatment chunks count mismatch"
|
|
|
|
|
99 |
|
100 |
# Data quality checks
|
|
|
101 |
for name, emb in [("Emergency", self.emergency_emb),
|
102 |
("Treatment", self.treatment_emb)]:
|
103 |
# Check for NaN and Inf
|
@@ -105,25 +140,35 @@ class TestEmbeddingValidation:
|
|
105 |
assert not np.isinf(emb).any(), f"{name} contains Inf values"
|
106 |
|
107 |
# Value distribution analysis
|
|
|
|
|
|
|
|
|
|
|
108 |
self.logger.info(f"\n{name} Embeddings Statistics:")
|
109 |
self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
|
110 |
self.logger.info(f"- Mean: {np.mean(emb):.3f}")
|
111 |
self.logger.info(f"- Std: {np.std(emb):.3f}")
|
112 |
|
|
|
113 |
self.logger.info("\n✅ All embedding validations passed")
|
114 |
|
115 |
except AssertionError as e:
|
|
|
116 |
self.logger.error(f"Validation failed: {str(e)}")
|
117 |
raise
|
118 |
|
119 |
def test_multiple_known_item_search(self):
|
120 |
"""Test ANNOY search with multiple random samples."""
|
|
|
121 |
self.logger.info("\n=== Multiple Known-Item Search Test ===")
|
122 |
|
|
|
123 |
emergency_index = AnnoyIndex(768, 'angular')
|
124 |
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
125 |
|
126 |
# Test 20 random samples
|
|
|
127 |
test_indices = np.random.choice(
|
128 |
self.emergency_emb.shape[0],
|
129 |
size=20,
|
@@ -131,36 +176,45 @@ class TestEmbeddingValidation:
|
|
131 |
)
|
132 |
|
133 |
success_count = 0
|
134 |
-
for
|
|
|
135 |
try:
|
136 |
test_emb = self.emergency_emb[test_idx]
|
137 |
indices, distances = self._safe_search(emergency_index, test_emb)
|
138 |
|
139 |
if indices is None:
|
|
|
140 |
continue
|
141 |
|
142 |
# Verify self-retrieval
|
143 |
assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
|
144 |
assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
|
145 |
success_count += 1
|
|
|
146 |
|
147 |
except AssertionError as e:
|
|
|
148 |
self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
|
149 |
|
|
|
150 |
self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
|
151 |
assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
|
|
|
152 |
|
153 |
def test_balanced_cross_dataset_search(self):
|
154 |
"""Test search across both emergency and treatment datasets."""
|
|
|
155 |
self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
|
156 |
|
157 |
# Initialize indices
|
|
|
158 |
emergency_index = AnnoyIndex(768, 'angular')
|
159 |
treatment_index = AnnoyIndex(768, 'angular')
|
160 |
|
161 |
try:
|
162 |
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
163 |
treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
|
|
|
164 |
|
165 |
# Test queries
|
166 |
test_queries = [
|
@@ -169,45 +223,75 @@ class TestEmbeddingValidation:
|
|
169 |
"What are the emergency procedures for anaphylactic shock?"
|
170 |
]
|
171 |
|
172 |
-
|
173 |
-
|
|
|
|
|
174 |
|
175 |
# Generate query vector
|
|
|
176 |
query_emb = self.model.encode([query])[0]
|
177 |
|
178 |
# Get top-5 results from each dataset
|
|
|
179 |
e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
|
180 |
t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
|
181 |
|
182 |
if None in [e_indices, e_distances, t_indices, t_distances]:
|
|
|
183 |
self.logger.error("Search failed for one or both datasets")
|
184 |
continue
|
185 |
|
186 |
# Print first sentence of each result
|
187 |
-
print("\
|
188 |
for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
|
189 |
text = self.emergency_chunks[idx]['text']
|
190 |
first_sentence = text.split('.')[0] + '.'
|
191 |
-
print(f"
|
192 |
-
print(first_sentence)
|
193 |
|
194 |
-
print("\
|
195 |
for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
|
196 |
text = self.treatment_chunks[idx]['text']
|
197 |
first_sentence = text.split('.')[0] + '.'
|
198 |
-
print(f"
|
199 |
-
|
|
|
200 |
|
201 |
except Exception as e:
|
|
|
202 |
self.logger.error(f"Test failed: {str(e)}")
|
203 |
raise
|
204 |
else:
|
|
|
205 |
self.logger.info("\n✅ Cross-dataset search test completed")
|
206 |
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
|
|
209 |
test = TestEmbeddingValidation()
|
210 |
test.setup_class()
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
import json
|
8 |
import logging
|
9 |
import os
|
10 |
+
import sys
|
11 |
from pathlib import Path
|
12 |
from typing import Tuple, List, Optional
|
13 |
from annoy import AnnoyIndex
|
14 |
from sentence_transformers import SentenceTransformer
|
15 |
|
16 |
+
print("\n=== Phase 1: Initializing Test Environment ===")
|
17 |
+
# Add src to python path
|
18 |
+
current_dir = Path(__file__).parent.resolve()
|
19 |
+
project_root = current_dir.parent
|
20 |
+
sys.path.append(str(project_root / "src"))
|
21 |
+
|
22 |
+
print(f"• Current directory: {current_dir}")
|
23 |
+
print(f"• Project root: {project_root}")
|
24 |
+
print(f"• Python path added: {project_root / 'src'}")
|
25 |
+
|
26 |
class TestEmbeddingValidation:
|
27 |
def setup_class(self):
|
28 |
"""Initialize test environment with necessary data and models."""
|
29 |
+
print("\n=== Phase 2: Setting up Test Environment ===")
|
30 |
+
|
31 |
# Setup logging
|
32 |
logging.basicConfig(
|
33 |
level=logging.DEBUG,
|
|
|
37 |
self.logger = logging.getLogger(__name__)
|
38 |
|
39 |
# Define base paths
|
40 |
+
self.project_root = Path(__file__).parent.parent.resolve()
|
41 |
self.models_dir = self.project_root / "models"
|
42 |
self.embeddings_dir = self.models_dir / "embeddings"
|
43 |
self.indices_dir = self.models_dir / "indices" / "annoy"
|
44 |
|
45 |
+
print(f"• Project root: {self.project_root}")
|
46 |
+
print(f"• Models directory: {self.models_dir}")
|
47 |
+
print(f"• Embeddings directory: {self.embeddings_dir}")
|
48 |
+
|
49 |
self.logger.info(f"Project root: {self.project_root}")
|
50 |
self.logger.info(f"Models directory: {self.models_dir}")
|
51 |
self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
|
52 |
|
53 |
try:
|
54 |
# Check directory existence
|
55 |
+
print("• Checking directory existence...")
|
56 |
if not self.embeddings_dir.exists():
|
57 |
raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
|
58 |
if not self.indices_dir.exists():
|
59 |
raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
|
60 |
|
61 |
# Load embeddings
|
62 |
+
print("• Loading embeddings...")
|
63 |
self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
|
64 |
self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
|
65 |
|
66 |
# Load chunks
|
67 |
+
print("• Loading chunk metadata...")
|
68 |
with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
|
69 |
self.emergency_chunks = json.load(f)
|
70 |
with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
|
71 |
self.treatment_chunks = json.load(f)
|
72 |
|
73 |
# Initialize model
|
74 |
+
print("• Loading PubMedBERT model...")
|
75 |
self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
|
76 |
|
77 |
+
print(f"• Emergency embeddings shape: {self.emergency_emb.shape}")
|
78 |
+
print(f"• Treatment embeddings shape: {self.treatment_emb.shape}")
|
79 |
+
print("✅ Test environment initialized successfully")
|
80 |
+
|
81 |
self.logger.info("Test environment initialized successfully")
|
82 |
self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
|
83 |
self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
|
84 |
|
85 |
except FileNotFoundError as e:
|
86 |
+
print(f"❌ File not found: {e}")
|
87 |
self.logger.error(f"File not found: {e}")
|
88 |
raise
|
89 |
except Exception as e:
|
90 |
+
print(f"❌ Error during initialization: {e}")
|
91 |
self.logger.error(f"Error during initialization: {e}")
|
92 |
raise
|
93 |
|
|
|
111 |
|
112 |
def test_embedding_dimensions(self):
|
113 |
"""Test embedding dimensions and data quality."""
|
114 |
+
print("\n=== Phase 3: Embedding Validation ===")
|
115 |
self.logger.info("\n=== Embedding Validation Report ===")
|
116 |
|
117 |
try:
|
118 |
# Basic dimension checks
|
119 |
+
print("• Checking embedding dimensions...")
|
120 |
assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
|
121 |
assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
|
122 |
+
print(f"✓ Emergency dimensions: {self.emergency_emb.shape}")
|
123 |
+
print(f"✓ Treatment dimensions: {self.treatment_emb.shape}")
|
124 |
|
125 |
# Count verification
|
126 |
+
print("• Verifying chunk count consistency...")
|
127 |
assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
|
128 |
"Emergency chunks count mismatch"
|
129 |
assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
|
130 |
"Treatment chunks count mismatch"
|
131 |
+
print(f"✓ Emergency: {len(self.emergency_chunks)} chunks = {self.emergency_emb.shape[0]} embeddings")
|
132 |
+
print(f"✓ Treatment: {len(self.treatment_chunks)} chunks = {self.treatment_emb.shape[0]} embeddings")
|
133 |
|
134 |
# Data quality checks
|
135 |
+
print("• Performing data quality checks...")
|
136 |
for name, emb in [("Emergency", self.emergency_emb),
|
137 |
("Treatment", self.treatment_emb)]:
|
138 |
# Check for NaN and Inf
|
|
|
140 |
assert not np.isinf(emb).any(), f"{name} contains Inf values"
|
141 |
|
142 |
# Value distribution analysis
|
143 |
+
print(f"\n📊 {name} Embeddings Statistics:")
|
144 |
+
print(f"• Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
|
145 |
+
print(f"• Mean: {np.mean(emb):.3f}")
|
146 |
+
print(f"• Std: {np.std(emb):.3f}")
|
147 |
+
|
148 |
self.logger.info(f"\n{name} Embeddings Statistics:")
|
149 |
self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
|
150 |
self.logger.info(f"- Mean: {np.mean(emb):.3f}")
|
151 |
self.logger.info(f"- Std: {np.std(emb):.3f}")
|
152 |
|
153 |
+
print("\n✅ All embedding validations passed")
|
154 |
self.logger.info("\n✅ All embedding validations passed")
|
155 |
|
156 |
except AssertionError as e:
|
157 |
+
print(f"❌ Validation failed: {str(e)}")
|
158 |
self.logger.error(f"Validation failed: {str(e)}")
|
159 |
raise
|
160 |
|
161 |
def test_multiple_known_item_search(self):
|
162 |
"""Test ANNOY search with multiple random samples."""
|
163 |
+
print("\n=== Phase 4: Multiple Known-Item Search Test ===")
|
164 |
self.logger.info("\n=== Multiple Known-Item Search Test ===")
|
165 |
|
166 |
+
print("• Loading emergency index...")
|
167 |
emergency_index = AnnoyIndex(768, 'angular')
|
168 |
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
169 |
|
170 |
# Test 20 random samples
|
171 |
+
print("• Selecting 20 random samples for self-retrieval test...")
|
172 |
test_indices = np.random.choice(
|
173 |
self.emergency_emb.shape[0],
|
174 |
size=20,
|
|
|
176 |
)
|
177 |
|
178 |
success_count = 0
|
179 |
+
print("• Testing self-retrieval for each sample...")
|
180 |
+
for i, test_idx in enumerate(test_indices, 1):
|
181 |
try:
|
182 |
test_emb = self.emergency_emb[test_idx]
|
183 |
indices, distances = self._safe_search(emergency_index, test_emb)
|
184 |
|
185 |
if indices is None:
|
186 |
+
print(f" {i}/20: ❌ Search failed for index {test_idx}")
|
187 |
continue
|
188 |
|
189 |
# Verify self-retrieval
|
190 |
assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
|
191 |
assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
|
192 |
success_count += 1
|
193 |
+
print(f" {i}/20: ✓ Index {test_idx} (distance: {distances[0]:.6f})")
|
194 |
|
195 |
except AssertionError as e:
|
196 |
+
print(f" {i}/20: ❌ Index {test_idx} failed: {str(e)}")
|
197 |
self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
|
198 |
|
199 |
+
print(f"\n📊 Self-Retrieval Results: {success_count}/20 tests passed ({success_count/20*100:.1f}%)")
|
200 |
self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
|
201 |
assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
|
202 |
+
print("✅ Multiple known-item search test passed")
|
203 |
|
204 |
def test_balanced_cross_dataset_search(self):
|
205 |
"""Test search across both emergency and treatment datasets."""
|
206 |
+
print("\n=== Phase 5: Cross-Dataset Search Test ===")
|
207 |
self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
|
208 |
|
209 |
# Initialize indices
|
210 |
+
print("• Loading ANNOY indices...")
|
211 |
emergency_index = AnnoyIndex(768, 'angular')
|
212 |
treatment_index = AnnoyIndex(768, 'angular')
|
213 |
|
214 |
try:
|
215 |
emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
|
216 |
treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
|
217 |
+
print("✓ Emergency and treatment indices loaded")
|
218 |
|
219 |
# Test queries
|
220 |
test_queries = [
|
|
|
223 |
"What are the emergency procedures for anaphylactic shock?"
|
224 |
]
|
225 |
|
226 |
+
print(f"• Testing {len(test_queries)} medical queries...")
|
227 |
+
|
228 |
+
for query_num, query in enumerate(test_queries, 1):
|
229 |
+
print(f"\n🔍 Query {query_num}/3: {query}")
|
230 |
|
231 |
# Generate query vector
|
232 |
+
print("• Generating query embedding...")
|
233 |
query_emb = self.model.encode([query])[0]
|
234 |
|
235 |
# Get top-5 results from each dataset
|
236 |
+
print("• Searching both datasets...")
|
237 |
e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
|
238 |
t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
|
239 |
|
240 |
if None in [e_indices, e_distances, t_indices, t_distances]:
|
241 |
+
print("❌ Search failed for one or both datasets")
|
242 |
self.logger.error("Search failed for one or both datasets")
|
243 |
continue
|
244 |
|
245 |
# Print first sentence of each result
|
246 |
+
print(f"\n📋 Emergency Dataset Results:")
|
247 |
for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
|
248 |
text = self.emergency_chunks[idx]['text']
|
249 |
first_sentence = text.split('.')[0] + '.'
|
250 |
+
print(f" E-{i} (distance: {dist:.3f}): {first_sentence[:80]}...")
|
|
|
251 |
|
252 |
+
print(f"\n📋 Treatment Dataset Results:")
|
253 |
for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
|
254 |
text = self.treatment_chunks[idx]['text']
|
255 |
first_sentence = text.split('.')[0] + '.'
|
256 |
+
print(f" T-{i} (distance: {dist:.3f}): {first_sentence[:80]}...")
|
257 |
+
|
258 |
+
print("✓ Query completed")
|
259 |
|
260 |
except Exception as e:
|
261 |
+
print(f"❌ Test failed: {str(e)}")
|
262 |
self.logger.error(f"Test failed: {str(e)}")
|
263 |
raise
|
264 |
else:
|
265 |
+
print("\n✅ Cross-dataset search test completed")
|
266 |
self.logger.info("\n✅ Cross-dataset search test completed")
|
267 |
|
268 |
+
def main():
|
269 |
+
"""Run all embedding validation tests"""
|
270 |
+
print("\n" + "="*60)
|
271 |
+
print("COMPREHENSIVE EMBEDDING VALIDATION TEST SUITE")
|
272 |
+
print("="*60)
|
273 |
+
|
274 |
test = TestEmbeddingValidation()
|
275 |
test.setup_class()
|
276 |
+
|
277 |
+
try:
|
278 |
+
test.test_embedding_dimensions()
|
279 |
+
test.test_multiple_known_item_search()
|
280 |
+
test.test_balanced_cross_dataset_search()
|
281 |
+
|
282 |
+
print("\n" + "="*60)
|
283 |
+
print("🎉 ALL EMBEDDING VALIDATION TESTS COMPLETED SUCCESSFULLY!")
|
284 |
+
print("="*60)
|
285 |
+
print("✅ Embedding dimensions validated")
|
286 |
+
print("✅ Self-retrieval accuracy confirmed")
|
287 |
+
print("✅ Cross-dataset search functionality verified")
|
288 |
+
print("="*60)
|
289 |
+
|
290 |
+
except Exception as e:
|
291 |
+
print("\n" + "="*60)
|
292 |
+
print("❌ EMBEDDING VALIDATION TESTS FAILED!")
|
293 |
+
print(f"Error: {str(e)}")
|
294 |
+
print("="*60)
|
295 |
+
|
296 |
+
if __name__ == "__main__":
|
297 |
+
main()
|
tests/test_end_to_end_pipeline.py
ADDED
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
End-to-End Pipeline Script Test for OnCall.ai
|
4 |
+
|
5 |
+
Tests the complete pipeline:
|
6 |
+
User Input → UserPrompt Processing → Retrieval → Generation → Structured Medical Advice
|
7 |
+
|
8 |
+
This script validates the entire workflow with realistic medical queries,
|
9 |
+
simulating the user confirmation process and generating final medical advice.
|
10 |
+
|
11 |
+
Author: OnCall.ai Team
|
12 |
+
Date: 2025-07-31
|
13 |
+
"""
|
14 |
+
|
15 |
+
import sys
|
16 |
+
import os
|
17 |
+
from pathlib import Path
|
18 |
+
import logging
|
19 |
+
import json
|
20 |
+
import traceback
|
21 |
+
from datetime import datetime
|
22 |
+
from typing import Dict, List, Any, Optional
|
23 |
+
|
24 |
+
# Add src directory to Python path
|
25 |
+
current_dir = Path(__file__).parent
|
26 |
+
project_root = current_dir.parent
|
27 |
+
src_dir = project_root / "src"
|
28 |
+
sys.path.insert(0, str(src_dir))
|
29 |
+
|
30 |
+
# Import all pipeline modules
|
31 |
+
try:
|
32 |
+
from user_prompt import UserPromptProcessor
|
33 |
+
from retrieval import BasicRetrievalSystem
|
34 |
+
from llm_clients import llm_Med42_70BClient
|
35 |
+
from generation import MedicalAdviceGenerator
|
36 |
+
from medical_conditions import CONDITION_KEYWORD_MAPPING
|
37 |
+
except ImportError as e:
|
38 |
+
print(f"❌ Import Error: {e}")
|
39 |
+
print(f"Current working directory: {os.getcwd()}")
|
40 |
+
print(f"Python path: {sys.path}")
|
41 |
+
sys.exit(1)
|
42 |
+
|
43 |
+
# Configure logging
|
44 |
+
logging.basicConfig(
|
45 |
+
level=logging.INFO,
|
46 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
47 |
+
handlers=[
|
48 |
+
logging.StreamHandler(),
|
49 |
+
logging.FileHandler(project_root / 'tests' / 'end_to_end_pipeline.log')
|
50 |
+
]
|
51 |
+
)
|
52 |
+
logger = logging.getLogger(__name__)
|
53 |
+
|
54 |
+
class EndToEndPipelineTest:
|
55 |
+
"""Complete pipeline test with realistic medical scenarios"""
|
56 |
+
|
57 |
+
def __init__(self):
|
58 |
+
"""Initialize test suite"""
|
59 |
+
self.start_time = datetime.now()
|
60 |
+
self.test_results = []
|
61 |
+
self.components_initialized = False
|
62 |
+
|
63 |
+
# Pipeline components
|
64 |
+
self.llm_client = None
|
65 |
+
self.retrieval_system = None
|
66 |
+
self.user_prompt_processor = None
|
67 |
+
self.medical_generator = None
|
68 |
+
|
69 |
+
def initialize_complete_pipeline(self):
|
70 |
+
"""Initialize all pipeline components"""
|
71 |
+
print("🔧 Initializing Complete OnCall.ai Pipeline...")
|
72 |
+
print("-" * 60)
|
73 |
+
|
74 |
+
try:
|
75 |
+
# Initialize LLM client
|
76 |
+
print("1. Initializing Med42-70B Client...")
|
77 |
+
self.llm_client = llm_Med42_70BClient()
|
78 |
+
print(" ✅ Med42-70B client ready")
|
79 |
+
|
80 |
+
# Initialize retrieval system
|
81 |
+
print("2. Initializing Dual-Index Retrieval System...")
|
82 |
+
self.retrieval_system = BasicRetrievalSystem()
|
83 |
+
print(" ✅ Emergency & Treatment indices loaded")
|
84 |
+
|
85 |
+
# Initialize user prompt processor
|
86 |
+
print("3. Initializing Multi-Level Prompt Processor...")
|
87 |
+
self.user_prompt_processor = UserPromptProcessor(
|
88 |
+
llm_client=self.llm_client,
|
89 |
+
retrieval_system=self.retrieval_system
|
90 |
+
)
|
91 |
+
print(" ✅ Fallback validation system ready")
|
92 |
+
|
93 |
+
# Initialize medical advice generator
|
94 |
+
print("4. Initializing Medical Advice Generator...")
|
95 |
+
self.medical_generator = MedicalAdviceGenerator(
|
96 |
+
llm_client=self.llm_client
|
97 |
+
)
|
98 |
+
print(" ✅ RAG generation system ready")
|
99 |
+
|
100 |
+
self.components_initialized = True
|
101 |
+
print(f"\n🎉 Complete pipeline initialized successfully!")
|
102 |
+
|
103 |
+
except Exception as e:
|
104 |
+
logger.error(f"Pipeline initialization failed: {e}")
|
105 |
+
print(f"❌ Initialization failed: {e}")
|
106 |
+
traceback.print_exc()
|
107 |
+
self.components_initialized = False
|
108 |
+
|
109 |
+
def get_realistic_test_queries(self) -> List[Dict[str, Any]]:
|
110 |
+
"""Define realistic medical queries for end-to-end testing"""
|
111 |
+
return [
|
112 |
+
{
|
113 |
+
"id": "e2e_001",
|
114 |
+
"query": "How to treat acute myocardial infarction in emergency department?",
|
115 |
+
"description": "Classic cardiac emergency with treatment focus",
|
116 |
+
"expected_intention": "treatment",
|
117 |
+
"category": "cardiac_emergency",
|
118 |
+
"simulated_confirmation": "yes"
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"id": "e2e_002",
|
122 |
+
"query": "Patient presenting with severe chest pain and shortness of breath",
|
123 |
+
"description": "Symptom-based emergency requiring assessment and treatment",
|
124 |
+
"expected_intention": "diagnosis",
|
125 |
+
"category": "multi_symptom",
|
126 |
+
"simulated_confirmation": "yes"
|
127 |
+
},
|
128 |
+
{
|
129 |
+
"id": "e2e_003",
|
130 |
+
"query": "What are the emergency protocols for acute stroke management?",
|
131 |
+
"description": "Neurological emergency with protocol focus",
|
132 |
+
"expected_intention": "treatment",
|
133 |
+
"category": "neurological_emergency",
|
134 |
+
"simulated_confirmation": "yes"
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"id": "e2e_004",
|
138 |
+
"query": "Differential diagnosis for sudden onset chest pain in young adult",
|
139 |
+
"description": "Diagnostic reasoning query",
|
140 |
+
"expected_intention": "diagnosis",
|
141 |
+
"category": "differential_diagnosis",
|
142 |
+
"simulated_confirmation": "yes"
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"id": "e2e_005",
|
146 |
+
"query": "Emergency management of pulmonary embolism",
|
147 |
+
"description": "Pulmonary emergency requiring immediate intervention",
|
148 |
+
"expected_intention": "treatment",
|
149 |
+
"category": "pulmonary_emergency",
|
150 |
+
"simulated_confirmation": "yes"
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"id": "e2e_006",
|
154 |
+
"query": "How to cook pasta properly?",
|
155 |
+
"description": "Non-medical query - should be rejected",
|
156 |
+
"expected_intention": None,
|
157 |
+
"category": "non_medical",
|
158 |
+
"simulated_confirmation": "reject_expected"
|
159 |
+
}
|
160 |
+
]
|
161 |
+
|
162 |
+
def run_scripted_end_to_end_tests(self):
    """Run every scripted end-to-end scenario and produce the summary report.

    Requires initialize_complete_pipeline() to have succeeded; otherwise
    prints an error and returns without running anything.
    """
    if not self.components_initialized:
        print("❌ Cannot run tests: pipeline not initialized")
        return

    scenarios = self.get_realistic_test_queries()

    print(f"\n🚀 Starting End-to-End Pipeline Tests")
    print(f"Total test scenarios: {len(scenarios)}")
    print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 80)

    # Push each scenario through the full pipeline and keep its outcome.
    for scenario in scenarios:
        self.test_results.append(self._execute_single_pipeline_test(scenario))

    # Summarize the session on stdout and persist it to disk.
    self._generate_end_to_end_report()
    self._save_end_to_end_results()
|
183 |
+
|
184 |
+
def _execute_single_pipeline_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Execute single test through complete pipeline.

    Runs one scenario through condition extraction, simulated user
    confirmation, guideline retrieval, and advice generation, timing
    each step. Never raises: any exception is captured into the result
    dict under 'error'/'traceback'.

    Args:
        test_case: Scenario dict with at least 'id', 'query',
            'description', 'expected_intention', 'category',
            'simulated_confirmation'.

    Returns:
        Result dict with 'success', 'total_pipeline_time', per-step
        timings under 'pipeline_steps', and (on success)
        'final_medical_advice'.
    """
    test_id = test_case["id"]
    query = test_case["query"]

    print(f"\n🧪 {test_id}: {test_case['description']}")
    print(f"Query: '{query}'")
    print(f"Expected: {test_case['expected_intention']} intention")
    print("-" * 70)

    pipeline_start = datetime.now()
    # Result skeleton; filled in step by step so a partial run still
    # records everything that happened before the failure.
    result = {
        "test_id": test_id,
        "test_case": test_case,
        "timestamp": datetime.now().isoformat(),
        "success": False,
        "error": None,
        "total_pipeline_time": 0,
        "pipeline_steps": {}
    }

    try:
        # STEP 1: User Prompt Processing
        print(" 🎯 Step 1: Condition extraction and validation...")
        step1_start = datetime.now()

        condition_result = self.user_prompt_processor.extract_condition_keywords(query)
        step1_time = (datetime.now() - step1_start).total_seconds()

        result["pipeline_steps"]["condition_extraction"] = {
            "duration": step1_time,
            "result": condition_result,
            "condition_found": bool(condition_result.get('condition'))
        }

        print(f" Condition: {condition_result.get('condition', 'None')}")
        print(f" Keywords: Emergency='{condition_result.get('emergency_keywords', 'None')}', Treatment='{condition_result.get('treatment_keywords', 'None')}'")
        print(f" Time: {step1_time:.3f}s")

        # Check if this is a non-medical query that should be rejected.
        # A rejection only counts as success when the scenario expected it.
        if condition_result.get('type') == 'invalid_query':
            print(" 🚫 Non-medical query correctly rejected")
            result["pipeline_steps"]["rejection"] = {
                "reason": "non_medical_query",
                "message": condition_result.get('message', '')
            }
            result["success"] = test_case['category'] == 'non_medical'
            return result

        # STEP 2: User Confirmation (Simulated)
        print(" 🤝 Step 2: User confirmation (simulated as 'yes')...")
        confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)

        result["pipeline_steps"]["confirmation"] = {
            "type": confirmation.get('type', 'unknown'),
            "simulated_response": test_case['simulated_confirmation']
        }

        # No extracted condition means nothing to retrieve against.
        if not condition_result.get('condition'):
            print(" ⚠️ No condition extracted, skipping retrieval and generation")
            result["pipeline_steps"]["pipeline_stopped"] = "no_condition"
            return result

        # STEP 3: Retrieval
        print(" 🔍 Step 3: Medical guideline retrieval...")
        step3_start = datetime.now()

        # Prefer the extracted keywords; fall back to the condition name
        # (or the raw query) when no keywords were produced.
        search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
        if not search_query:
            search_query = condition_result.get('condition', query)

        retrieval_results = self.retrieval_system.search(search_query, top_k=5)
        step3_time = (datetime.now() - step3_start).total_seconds()

        processed_results = retrieval_results.get('processed_results', [])
        emergency_count = len([r for r in processed_results if r.get('type') == 'emergency'])
        treatment_count = len([r for r in processed_results if r.get('type') == 'treatment'])

        result["pipeline_steps"]["retrieval"] = {
            "duration": step3_time,
            "search_query": search_query,
            "total_results": len(processed_results),
            "emergency_results": emergency_count,
            "treatment_results": treatment_count
        }

        print(f" Search Query: '{search_query}'")
        print(f" Results: {len(processed_results)} total ({emergency_count} emergency, {treatment_count} treatment)")
        print(f" Time: {step3_time:.3f}s")

        # STEP 4: Medical Advice Generation
        print(" 🧠 Step 4: Medical advice generation...")
        step4_start = datetime.now()

        # Determine intention (simulate intelligent detection) — uses the
        # scenario's expected intention rather than live classification.
        intention = test_case.get('expected_intention')

        medical_advice = self.medical_generator.generate_medical_advice(
            user_query=query,
            retrieval_results=retrieval_results,
            intention=intention
        )
        step4_time = (datetime.now() - step4_start).total_seconds()

        result["pipeline_steps"]["generation"] = {
            "duration": step4_time,
            "intention_used": intention,
            "confidence_score": medical_advice.get('confidence_score', 0.0),
            "advice_length": len(medical_advice.get('medical_advice', '')),
            "chunks_used": medical_advice.get('query_metadata', {}).get('total_chunks_used', 0)
        }

        print(f" Intention: {intention}")
        print(f" Confidence: {medical_advice.get('confidence_score', 0.0):.2f}")
        print(f" Advice Length: {len(medical_advice.get('medical_advice', ''))} chars")
        print(f" Chunks Used: {medical_advice.get('query_metadata', {}).get('total_chunks_used', 0)}")
        print(f" Time: {step4_time:.3f}s")

        # STEP 5: Results Summary
        total_time = (datetime.now() - pipeline_start).total_seconds()
        result["total_pipeline_time"] = total_time
        result["final_medical_advice"] = medical_advice
        result["success"] = True

        print(f"\n ✅ Pipeline completed successfully!")
        print(f" 📊 Total Time: {total_time:.3f}s")
        print(f" 🩺 Medical Advice Preview:")
        print(f" {medical_advice.get('medical_advice', 'No advice generated')[:150]}...")

    except Exception as e:
        # Record the failure but still return a well-formed result dict.
        total_time = (datetime.now() - pipeline_start).total_seconds()
        result["total_pipeline_time"] = total_time
        result["error"] = str(e)
        result["traceback"] = traceback.format_exc()

        logger.error(f"Pipeline test {test_id} failed: {e}")
        print(f" ❌ Pipeline failed: {e}")

    return result
|
323 |
+
|
324 |
+
def _determine_extraction_source(self, condition_result: Dict) -> str:
|
325 |
+
"""Determine how the condition was extracted"""
|
326 |
+
if condition_result.get('semantic_confidence') is not None:
|
327 |
+
return "semantic_search"
|
328 |
+
elif condition_result.get('generic_confidence') is not None:
|
329 |
+
return "generic_search"
|
330 |
+
elif condition_result.get('condition') in CONDITION_KEYWORD_MAPPING:
|
331 |
+
return "predefined_mapping"
|
332 |
+
else:
|
333 |
+
return "llm_extraction"
|
334 |
+
|
335 |
+
def _generate_end_to_end_report(self):
|
336 |
+
"""Generate comprehensive end-to-end test report"""
|
337 |
+
end_time = datetime.now()
|
338 |
+
total_duration = (end_time - self.start_time).total_seconds()
|
339 |
+
|
340 |
+
successful_tests = [r for r in self.test_results if r['success']]
|
341 |
+
failed_tests = [r for r in self.test_results if not r['success']]
|
342 |
+
|
343 |
+
print("\n" + "=" * 80)
|
344 |
+
print("📊 END-TO-END PIPELINE TEST REPORT")
|
345 |
+
print("=" * 80)
|
346 |
+
|
347 |
+
# Overall Statistics
|
348 |
+
print(f"🕐 Execution Summary:")
|
349 |
+
print(f" Test session duration: {total_duration:.3f}s")
|
350 |
+
print(f" Average per test: {total_duration/len(self.test_results):.3f}s")
|
351 |
+
|
352 |
+
print(f"\n📈 Pipeline Results:")
|
353 |
+
print(f" Total tests: {len(self.test_results)}")
|
354 |
+
print(f" Successful: {len(successful_tests)} ✅")
|
355 |
+
print(f" Failed: {len(failed_tests)} ❌")
|
356 |
+
print(f" Success rate: {len(successful_tests)/len(self.test_results)*100:.1f}%")
|
357 |
+
|
358 |
+
# Performance Analysis
|
359 |
+
if successful_tests:
|
360 |
+
print(f"\n⚡ Performance Analysis:")
|
361 |
+
|
362 |
+
# Calculate average times for each step
|
363 |
+
step_times = {}
|
364 |
+
for result in successful_tests:
|
365 |
+
for step_name, step_data in result.get('pipeline_steps', {}).items():
|
366 |
+
if 'duration' in step_data:
|
367 |
+
if step_name not in step_times:
|
368 |
+
step_times[step_name] = []
|
369 |
+
step_times[step_name].append(step_data['duration'])
|
370 |
+
|
371 |
+
for step_name, times in step_times.items():
|
372 |
+
avg_time = sum(times) / len(times)
|
373 |
+
print(f" {step_name.replace('_', ' ').title()}: {avg_time:.3f}s average")
|
374 |
+
|
375 |
+
# Overall pipeline performance
|
376 |
+
total_times = [r['total_pipeline_time'] for r in successful_tests]
|
377 |
+
avg_total = sum(total_times) / len(total_times)
|
378 |
+
print(f" Complete Pipeline: {avg_total:.3f}s average")
|
379 |
+
|
380 |
+
# Detailed Results
|
381 |
+
print(f"\n📝 Detailed Test Results:")
|
382 |
+
for result in self.test_results:
|
383 |
+
test_case = result['test_case']
|
384 |
+
status = "✅ PASS" if result['success'] else "❌ FAIL"
|
385 |
+
|
386 |
+
print(f"\n 📋 {result['test_id']}: {status}")
|
387 |
+
print(f" Query: '{test_case['query']}'")
|
388 |
+
print(f" Category: {test_case['category']}")
|
389 |
+
print(f" Total Time: {result['total_pipeline_time']:.3f}s")
|
390 |
+
|
391 |
+
if result['success']:
|
392 |
+
steps = result.get('pipeline_steps', {})
|
393 |
+
if 'condition_extraction' in steps:
|
394 |
+
condition = steps['condition_extraction']['result'].get('condition', 'None')
|
395 |
+
print(f" Condition Extracted: {condition}")
|
396 |
+
|
397 |
+
if 'generation' in steps:
|
398 |
+
confidence = steps['generation'].get('confidence_score', 0.0)
|
399 |
+
chunks = steps['generation'].get('chunks_used', 0)
|
400 |
+
print(f" Generation: {confidence:.2f} confidence, {chunks} chunks")
|
401 |
+
|
402 |
+
if 'final_medical_advice' in result:
|
403 |
+
advice = result['final_medical_advice'].get('medical_advice', '')
|
404 |
+
print(f" Advice Preview: {advice[:100]}...")
|
405 |
+
else:
|
406 |
+
if result.get('error'):
|
407 |
+
print(f" Error: {result['error']}")
|
408 |
+
elif 'rejection' in result.get('pipeline_steps', {}):
|
409 |
+
print(f" Rejected: {result['pipeline_steps']['rejection']['reason']}")
|
410 |
+
|
411 |
+
print("\n" + "=" * 80)
|
412 |
+
|
413 |
+
def _save_end_to_end_results(self):
    """Persist the full test session (metadata + per-test results) as JSON.

    Writes to tests/end_to_end_pipeline_results_<timestamp>.json under
    the project root. Save failures are logged and printed but never
    raised, so reporting cannot crash the test run.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = project_root / 'tests' / f'end_to_end_pipeline_results_{timestamp}.json'

    try:
        comprehensive_results = {
            "test_metadata": {
                "test_type": "end_to_end_pipeline",
                "timestamp": datetime.now().isoformat(),
                "session_start": self.start_time.isoformat(),
                "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                "total_tests": len(self.test_results),
                "successful_tests": len([r for r in self.test_results if r['success']]),
                "failed_tests": len([r for r in self.test_results if not r['success']])
            },
            "pipeline_results": self.test_results,
            # NOTE(review): statuses are hard-coded; they reflect that saving
            # only happens after a fully initialized run, not a live health check.
            "component_status": {
                "user_prompt_processor": "operational",
                "retrieval_system": "operational",
                "medical_generator": "operational",
                "med42_llm_client": "operational"
            }
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

        # Fix: report the actual output path (the original printed a
        # literal "(unknown)" placeholder instead of the filename).
        print(f"📁 End-to-end test results saved to: {filename}")

    except Exception as e:
        logger.error(f"Failed to save test results: {e}")
        print(f"⚠️ Failed to save test results: {e}")
|
446 |
+
|
447 |
+
def main():
    """Entry point: build the pipeline, run the scripted tests, return an exit code."""
    print("🏥 OnCall.ai Complete End-to-End Pipeline Test")
    print("Testing: User Input → UserPrompt → Retrieval → Generation")
    print("=" * 70)

    # Build the suite and bring up every pipeline component.
    suite = EndToEndPipelineTest()
    suite.initialize_complete_pipeline()

    # Bail out early when any component failed to initialize.
    if not suite.components_initialized:
        print("❌ Pipeline initialization failed. Cannot proceed with testing.")
        return 1

    suite.run_scripted_end_to_end_tests()

    print(f"\n🎯 End-to-end testing completed!")
    print("Next step: Create Gradio interface for interactive testing")
    return 0
|
470 |
+
|
471 |
+
if __name__ == "__main__":
    # Propagate main()'s exit code to the shell.
    sys.exit(main())
|
tests/test_multilevel_fallback_validation.py
ADDED
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Multi-Level Fallback Validation Test Suite for OnCall.ai
|
4 |
+
|
5 |
+
This test specifically validates the 5-level fallback mechanism:
|
6 |
+
Level 1: Predefined Mapping (Fast Path)
|
7 |
+
Level 2: Llama3-Med42-70B Extraction
|
8 |
+
Level 3: Semantic Search Fallback
|
9 |
+
Level 4: Medical Query Validation
|
10 |
+
Level 5: Generic Medical Search
|
11 |
+
|
12 |
+
Author: OnCall.ai Team
|
13 |
+
Date: 2025-07-30
|
14 |
+
"""
|
15 |
+
|
16 |
+
import sys
|
17 |
+
import os
|
18 |
+
from pathlib import Path
|
19 |
+
import logging
|
20 |
+
import json
|
21 |
+
import traceback
|
22 |
+
from datetime import datetime
|
23 |
+
from typing import Dict, List, Any, Optional
|
24 |
+
|
25 |
+
# Add src directory to Python path
|
26 |
+
current_dir = Path(__file__).parent
|
27 |
+
project_root = current_dir.parent
|
28 |
+
src_dir = project_root / "src"
|
29 |
+
sys.path.insert(0, str(src_dir))
|
30 |
+
|
31 |
+
# Import our modules
|
32 |
+
try:
|
33 |
+
from user_prompt import UserPromptProcessor
|
34 |
+
from retrieval import BasicRetrievalSystem
|
35 |
+
from llm_clients import llm_Med42_70BClient
|
36 |
+
from medical_conditions import CONDITION_KEYWORD_MAPPING
|
37 |
+
except ImportError as e:
|
38 |
+
print(f"❌ Import Error: {e}")
|
39 |
+
print(f"Current working directory: {os.getcwd()}")
|
40 |
+
print(f"Python path: {sys.path}")
|
41 |
+
sys.exit(1)
|
42 |
+
|
43 |
+
# Configure logging
|
44 |
+
logging.basicConfig(
|
45 |
+
level=logging.INFO,
|
46 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
47 |
+
handlers=[
|
48 |
+
logging.StreamHandler(),
|
49 |
+
logging.FileHandler(project_root / 'tests' / 'multilevel_fallback_test.log')
|
50 |
+
]
|
51 |
+
)
|
52 |
+
logger = logging.getLogger(__name__)
|
53 |
+
|
54 |
+
class MultilevelFallbackTest:
|
55 |
+
"""Test suite specifically for the 5-level fallback mechanism"""
|
56 |
+
|
57 |
+
def __init__(self):
|
58 |
+
"""Initialize test suite"""
|
59 |
+
self.start_time = datetime.now()
|
60 |
+
self.results = []
|
61 |
+
self.components_initialized = False
|
62 |
+
|
63 |
+
# Component references
|
64 |
+
self.llm_client = None
|
65 |
+
self.retrieval_system = None
|
66 |
+
self.user_prompt_processor = None
|
67 |
+
|
68 |
+
def initialize_components(self):
    """Construct the LLM client, retrieval system, and prompt processor in order.

    Sets self.components_initialized to True only when every component
    constructs without raising; any failure is logged and leaves the
    flag False so callers can bail out.
    """
    print("🔧 Initializing Components for Multilevel Fallback Test...")
    print("-" * 60)

    try:
        print("1. Initializing Llama3-Med42-70B Client...")
        self.llm_client = llm_Med42_70BClient()
        print(" ✅ LLM client initialized")

        print("2. Initializing Retrieval System...")
        self.retrieval_system = BasicRetrievalSystem()
        print(" ✅ Retrieval system initialized")

        # The prompt processor is wired to the two components built above.
        print("3. Initializing User Prompt Processor...")
        self.user_prompt_processor = UserPromptProcessor(
            llm_client=self.llm_client,
            retrieval_system=self.retrieval_system
        )
        print(" ✅ User prompt processor initialized")

        self.components_initialized = True
        print("\n🎉 All components initialized successfully!")

    except Exception as e:
        self.components_initialized = False
        logger.error(f"Component initialization failed: {e}")
        print(f"❌ Component initialization failed: {e}")
        traceback.print_exc()
|
100 |
+
|
101 |
+
def get_multilevel_test_cases(self) -> List[Dict[str, Any]]:
    """Define test cases specifically targeting each fallback level.

    Returns a static list of scenario dicts. Each carries an 'id',
    the input 'query', an 'expected_level' (1-5), and depending on the
    level either an 'expected_condition' / 'expected_result', plus the
    'expected_source' path and a 'category' used by the validator to
    pick its matching rule.
    """
    return [
        # Level 1: Predefined Mapping Tests
        {
            "id": "level1_001",
            "query": "acute myocardial infarction treatment",
            "description": "Level 1: Direct predefined condition match",
            "expected_level": 1,
            "expected_condition": "acute myocardial infarction",
            "expected_source": "predefined_mapping",
            "category": "level1_predefined"
        },
        {
            "id": "level1_002",
            "query": "how to manage acute stroke?",
            "description": "Level 1: Predefined stroke condition",
            "expected_level": 1,
            "expected_condition": "acute stroke",
            "expected_source": "predefined_mapping",
            "category": "level1_predefined"
        },
        {
            "id": "level1_003",
            "query": "pulmonary embolism emergency protocol",
            "description": "Level 1: Predefined PE condition",
            "expected_level": 1,
            "expected_condition": "pulmonary embolism",
            "expected_source": "predefined_mapping",
            "category": "level1_predefined"
        },

        # Level 2: LLM Extraction Tests — symptom descriptions with no
        # direct mapping hit; a list of expected_condition values means
        # any one of them counts as a match.
        {
            "id": "level2_001",
            "query": "patient with severe crushing chest pain radiating to left arm",
            "description": "Level 2: Symptom-based query requiring LLM analysis",
            "expected_level": 2,
            "expected_condition": ["acute myocardial infarction", "acute coronary syndrome"],
            "expected_source": "llm_extraction",
            "category": "level2_llm"
        },
        {
            "id": "level2_002",
            "query": "sudden onset weakness on right side with speech difficulty",
            "description": "Level 2: Neurological symptoms requiring LLM",
            "expected_level": 2,
            "expected_condition": ["acute stroke", "cerebrovascular accident"],
            "expected_source": "llm_extraction",
            "category": "level2_llm"
        },

        # Level 3: Semantic Search Tests — broad medical phrasing that
        # neither the mapping nor the LLM pins to one condition.
        {
            "id": "level3_001",
            "query": "emergency management of cardiovascular crisis",
            "description": "Level 3: Generic medical terms requiring semantic search",
            "expected_level": 3,
            "expected_source": "semantic_search",
            "category": "level3_semantic"
        },
        {
            "id": "level3_002",
            "query": "urgent neurological intervention protocols",
            "description": "Level 3: Medical terminology requiring semantic fallback",
            "expected_level": 3,
            "expected_source": "semantic_search",
            "category": "level3_semantic"
        },

        # Level 4a: Non-Medical Query Rejection
        {
            "id": "level4a_001",
            "query": "how to cook pasta properly?",
            "description": "Level 4a: Non-medical query should be rejected",
            "expected_level": 4,
            "expected_result": "invalid_query",
            "expected_source": "validation_rejection",
            "category": "level4a_rejection"
        },
        {
            "id": "level4a_002",
            "query": "best programming language to learn in 2025",
            "description": "Level 4a: Technology query should be rejected",
            "expected_level": 4,
            "expected_result": "invalid_query",
            "expected_source": "validation_rejection",
            "category": "level4a_rejection"
        },
        {
            "id": "level4a_003",
            "query": "weather forecast for tomorrow",
            "description": "Level 4a: Weather query should be rejected",
            "expected_level": 4,
            "expected_result": "invalid_query",
            "expected_source": "validation_rejection",
            "category": "level4a_rejection"
        },

        # Level 4b + 5: Obscure Medical Terms → Generic Search
        {
            "id": "level4b_001",
            "query": "rare hematologic malignancy treatment approaches",
            "description": "Level 4b→5: Obscure medical query passing validation to generic search",
            "expected_level": 5,
            "expected_condition": "generic medical query",
            "expected_source": "generic_search",
            "category": "level4b_to_5"
        },
        {
            "id": "level4b_002",
            "query": "idiopathic thrombocytopenic purpura management guidelines",
            "description": "Level 4b→5: Rare condition requiring generic medical search",
            "expected_level": 5,
            "expected_condition": "generic medical query",
            "expected_source": "generic_search",
            "category": "level4b_to_5"
        },
        {
            "id": "level4b_003",
            "query": "necrotizing fasciitis surgical intervention protocols",
            "description": "Level 4b→5: Rare emergency condition → generic search",
            "expected_level": 5,
            "expected_condition": "generic medical query",
            "expected_source": "generic_search",
            "category": "level4b_to_5"
        }
    ]
|
229 |
+
|
230 |
+
def run_single_fallback_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Execute a single fallback test case with level detection.

    Extracts condition keywords for the case's query, infers which
    fallback level produced the result, then validates level and
    condition against the case's expectations. Never raises: failures
    are captured into the result dict under 'error'/'traceback'.

    Args:
        test_case: Scenario dict from get_multilevel_test_cases().

    Returns:
        Result dict with 'success', 'detected_level', 'execution_time',
        the raw 'condition_result', and validation fields merged in.
    """
    test_id = test_case["id"]
    query = test_case["query"]

    print(f"\n🔍 {test_id}: {test_case['description']}")
    print(f"Query: '{query}'")
    print(f"Expected Level: {test_case.get('expected_level', 'Unknown')}")
    print("-" * 70)

    # Result skeleton; populated step by step so a failed run still
    # carries everything observed before the exception.
    result = {
        "test_id": test_id,
        "test_case": test_case,
        "timestamp": datetime.now().isoformat(),
        "success": False,
        "error": None,
        "execution_time": 0,
        "detected_level": None,
        "condition_result": {}
    }

    start_time = datetime.now()

    try:
        # Execute condition extraction with level detection
        print("🎯 Executing multilevel fallback...")
        condition_start = datetime.now()

        condition_result = self.user_prompt_processor.extract_condition_keywords(query)
        condition_time = (datetime.now() - condition_start).total_seconds()

        # Detect which level was used
        detected_level = self._detect_fallback_level(condition_result)

        result["condition_result"] = condition_result
        result["detected_level"] = detected_level
        result["execution_time"] = condition_time

        print(f" ✅ Detected Level: {detected_level}")
        print(f" Condition: {condition_result.get('condition', 'None')}")
        print(f" Emergency Keywords: {condition_result.get('emergency_keywords', 'None')}")
        print(f" Treatment Keywords: {condition_result.get('treatment_keywords', 'None')}")
        print(f" Execution Time: {condition_time:.3f}s")

        # Validate expected behavior; merges level/condition match flags
        # and the overall 'success' into the result.
        validation_result = self._validate_expected_behavior(test_case, detected_level, condition_result)
        result.update(validation_result)

        if result["success"]:
            print(" 🎉 Test PASSED - Expected behavior achieved")
        else:
            print(f" ⚠️ Test PARTIAL - {result.get('validation_message', 'Unexpected behavior')}")

    except Exception as e:
        # Record the failure but still return a well-formed result dict.
        total_time = (datetime.now() - start_time).total_seconds()
        result["execution_time"] = total_time
        result["error"] = str(e)
        result["traceback"] = traceback.format_exc()

        logger.error(f"Test {test_id} failed: {e}")
        print(f" ❌ Test FAILED: {e}")

    return result
|
293 |
+
|
294 |
+
def _detect_fallback_level(self, condition_result: Dict[str, Any]) -> int:
|
295 |
+
"""
|
296 |
+
Detect which fallback level was used based on the condition result.
|
297 |
+
|
298 |
+
Fallback levels:
|
299 |
+
0: No result or unknown fallback level.
|
300 |
+
1: Predefined Mapping (Fast Path) - The condition matches a predefined mapping.
|
301 |
+
2: Llama3-Med42-70B Extraction - The condition is extracted by the LLM.
|
302 |
+
3: Semantic Search Fallback - The result includes a semantic confidence score.
|
303 |
+
4: Medical Query Validation - The query is deemed invalid (e.g., 'invalid_query').
|
304 |
+
5: Generic Medical Search - The condition is identified as a generic medical query.
|
305 |
+
|
306 |
+
Args:
|
307 |
+
condition_result (Dict[str, Any]): The result of the condition extraction process.
|
308 |
+
|
309 |
+
Returns:
|
310 |
+
int: The detected fallback level (0-5).
|
311 |
+
"""
|
312 |
+
if not condition_result:
|
313 |
+
return 0 # No result
|
314 |
+
|
315 |
+
# Check for validation rejection (Level 4a)
|
316 |
+
if condition_result.get('type') == 'invalid_query':
|
317 |
+
return 4
|
318 |
+
|
319 |
+
# Check for generic search (Level 5)
|
320 |
+
if condition_result.get('condition') == 'generic medical query':
|
321 |
+
return 5
|
322 |
+
|
323 |
+
# Check for semantic search (Level 3)
|
324 |
+
if 'semantic_confidence' in condition_result:
|
325 |
+
return 3
|
326 |
+
|
327 |
+
# Check for predefined mapping (Level 1)
|
328 |
+
condition = condition_result.get('condition', '')
|
329 |
+
if condition and condition in CONDITION_KEYWORD_MAPPING:
|
330 |
+
return 1
|
331 |
+
|
332 |
+
# Otherwise assume LLM extraction (Level 2)
|
333 |
+
if condition:
|
334 |
+
return 2
|
335 |
+
|
336 |
+
return 0 # Unknown
|
337 |
+
|
338 |
+
def _validate_expected_behavior(self, test_case: Dict[str, Any], detected_level: int,
|
339 |
+
condition_result: Dict[str, Any]) -> Dict[str, Any]:
|
340 |
+
"""Validate if the test behaved as expected"""
|
341 |
+
expected_level = test_case.get('expected_level')
|
342 |
+
validation_result = {
|
343 |
+
"level_match": detected_level == expected_level,
|
344 |
+
"condition_match": False,
|
345 |
+
"success": False,
|
346 |
+
"validation_message": ""
|
347 |
+
}
|
348 |
+
|
349 |
+
# Check level match
|
350 |
+
if validation_result["level_match"]:
|
351 |
+
validation_result["validation_message"] += f"✅ Level {detected_level} as expected. "
|
352 |
+
else:
|
353 |
+
validation_result["validation_message"] += f"⚠️ Level {detected_level} != expected {expected_level}. "
|
354 |
+
|
355 |
+
# Check condition/result match based on test type
|
356 |
+
if test_case["category"] == "level4a_rejection":
|
357 |
+
# Should be rejected
|
358 |
+
validation_result["condition_match"] = condition_result.get('type') == 'invalid_query'
|
359 |
+
if validation_result["condition_match"]:
|
360 |
+
validation_result["validation_message"] += "✅ Query correctly rejected. "
|
361 |
+
else:
|
362 |
+
validation_result["validation_message"] += "⚠️ Query should have been rejected. "
|
363 |
+
|
364 |
+
elif test_case["category"] == "level4b_to_5":
|
365 |
+
# Should result in generic medical query
|
366 |
+
validation_result["condition_match"] = condition_result.get('condition') == 'generic medical query'
|
367 |
+
if validation_result["condition_match"]:
|
368 |
+
validation_result["validation_message"] += "✅ Generic medical search triggered. "
|
369 |
+
else:
|
370 |
+
validation_result["validation_message"] += "⚠️ Should trigger generic medical search. "
|
371 |
+
|
372 |
+
else:
|
373 |
+
# Check expected condition
|
374 |
+
expected_conditions = test_case.get('expected_condition', [])
|
375 |
+
if isinstance(expected_conditions, str):
|
376 |
+
expected_conditions = [expected_conditions]
|
377 |
+
|
378 |
+
actual_condition = condition_result.get('condition', '')
|
379 |
+
validation_result["condition_match"] = any(
|
380 |
+
expected.lower() in actual_condition.lower()
|
381 |
+
for expected in expected_conditions
|
382 |
+
)
|
383 |
+
|
384 |
+
if validation_result["condition_match"]:
|
385 |
+
validation_result["validation_message"] += f"✅ Condition '{actual_condition}' matches expected. "
|
386 |
+
else:
|
387 |
+
validation_result["validation_message"] += f"⚠️ Condition '{actual_condition}' != expected {expected_conditions}. "
|
388 |
+
|
389 |
+
# Overall success
|
390 |
+
validation_result["success"] = validation_result["level_match"] or validation_result["condition_match"]
|
391 |
+
|
392 |
+
return validation_result
|
393 |
+
|
394 |
+
def run_all_fallback_tests(self):
    """Run every fallback test case in order, then emit the report and JSON dump.

    Aborts immediately when initialize_components() did not succeed.
    """
    if not self.components_initialized:
        print("❌ Cannot run tests: components not initialized")
        return

    test_cases = self.get_multilevel_test_cases()

    print(f"\n🚀 Starting Multilevel Fallback Test Suite")
    print(f"Total test cases: {len(test_cases)}")
    print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 80)

    # Execute each case and accumulate its result dict.
    self.results.extend(
        self.run_single_fallback_test(case) for case in test_cases
    )

    # Reporting and persistence happen after all cases complete.
    self.generate_fallback_report()
    self.save_fallback_results()
|
416 |
+
def generate_fallback_report(self):
    """Generate detailed fallback analysis report.

    Prints an execution summary, pass/partial/fail counts, per-level timing
    distribution, per-category success rates, and a detailed listing of every
    test result. Returns None; output goes to stdout only.
    """
    end_time = datetime.now()
    total_duration = (end_time - self.start_time).total_seconds()

    # Fix: the average-per-test and success-rate computations below divide by
    # len(self.results); guard against ZeroDivisionError when no test ran
    # (e.g. component initialization failed mid-run).
    if not self.results:
        print("\n⚠️ No test results to report.")
        return

    successful_tests = [r for r in self.results if r['success']]
    failed_tests = [r for r in self.results if not r['success']]
    # NOTE: "partial" tests (validation mismatch but no hard error) are a
    # subset of failed_tests, so the three counts printed below overlap by design.
    partial_tests = [r for r in self.results if not r['success'] and not r.get('error')]

    print("\n" + "=" * 80)
    print("📊 MULTILEVEL FALLBACK TEST REPORT")
    print("=" * 80)

    # Overall Statistics
    print(f"🕐 Execution Summary:")
    print(f" Total duration: {total_duration:.3f}s")
    print(f" Average per test: {total_duration/len(self.results):.3f}s")

    print(f"\n📈 Test Results:")
    print(f" Total tests: {len(self.results)}")
    print(f" Passed: {len(successful_tests)} ✅")
    print(f" Partial: {len(partial_tests)} ⚠️")
    print(f" Failed: {len(failed_tests)} ❌")
    print(f" Success rate: {len(successful_tests)/len(self.results)*100:.1f}%")

    # Level Distribution Analysis: count tests and collect timings per
    # detected fallback level (errored tests carry no level and are skipped).
    level_distribution = {}
    level_performance = {}

    for result in self.results:
        if not result.get('error'):
            level = result.get('detected_level', 0)
            level_distribution[level] = level_distribution.get(level, 0) + 1

            if level not in level_performance:
                level_performance[level] = []
            level_performance[level].append(result['execution_time'])

    print(f"\n🎯 Level Distribution Analysis:")
    for level in sorted(level_distribution.keys()):
        count = level_distribution[level]
        avg_time = sum(level_performance[level]) / len(level_performance[level])
        level_name = {
            1: "Predefined Mapping",
            2: "LLM Extraction",
            3: "Semantic Search",
            4: "Validation Rejection",
            5: "Generic Search"
        }.get(level, f"Unknown ({level})")

        print(f" Level {level} ({level_name}): {count} tests, avg {avg_time:.3f}s")

    # Category Analysis: per-category pass ratio.
    categories = {}
    for result in self.results:
        category = result['test_case']['category']
        if category not in categories:
            categories[category] = {'total': 0, 'passed': 0}
        categories[category]['total'] += 1
        if result['success']:
            categories[category]['passed'] += 1

    print(f"\n📋 Category Analysis:")
    for category, stats in categories.items():
        success_rate = stats['passed'] / stats['total'] * 100
        print(f" {category}: {stats['passed']}/{stats['total']} ({success_rate:.1f}%)")

    # Detailed Results: one stanza per executed test case.
    print(f"\n📝 Detailed Test Results:")
    for result in self.results:
        test_case = result['test_case']
        status = "✅ PASS" if result['success'] else ("❌ FAIL" if result.get('error') else "⚠️ PARTIAL")

        print(f"\n {result['test_id']}: {status}")
        print(f" Query: '{test_case['query']}'")
        print(f" Expected Level: {test_case.get('expected_level', 'N/A')}")
        print(f" Detected Level: {result.get('detected_level', 'N/A')}")
        print(f" Condition: {result.get('condition_result', {}).get('condition', 'None')}")
        print(f" Time: {result['execution_time']:.3f}s")

        if result.get('validation_message'):
            print(f" Validation: {result['validation_message']}")

        if result.get('error'):
            print(f" Error: {result['error']}")

    print("\n" + "=" * 80)
|
504 |
+
def save_fallback_results(self):
    """Save detailed test results to a timestamped JSON file.

    Writes run metadata (timestamp, duration, pass/fail counts) plus every
    per-test result dict to tests/multilevel_fallback_results_<ts>.json
    under the module-level project_root. Failures are logged and printed,
    never raised, so a reporting error cannot abort the test run.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = project_root / 'tests' / f'multilevel_fallback_results_{timestamp}.json'

    try:
        comprehensive_results = {
            "test_metadata": {
                "timestamp": datetime.now().isoformat(),
                "test_type": "multilevel_fallback_validation",
                "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                "total_tests": len(self.results),
                "passed_tests": len([r for r in self.results if r['success']]),
                "failed_tests": len([r for r in self.results if not r['success']])
            },
            "fallback_results": self.results
        }

        # ensure_ascii=False keeps the emoji/Unicode messages readable in the file.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

        # Fix: report the actual output path instead of a "(unknown)" placeholder.
        print(f"📁 Multilevel fallback results saved to: {filename}")

    except Exception as e:
        logger.error(f"Failed to save test results: {e}")
        print(f"⚠️ Failed to save test results: {e}")
|
531 |
+
def main():
    """Entry point: build the suite, initialize components, run all tests.

    Returns a process exit code: 0 on a completed run, 1 when component
    initialization fails.
    """
    print("🏥 OnCall.ai Multilevel Fallback Validation Test")
    print("=" * 60)

    # Build and bring up the suite; initialization failures are reported
    # via the components_initialized flag rather than an exception.
    suite = MultilevelFallbackTest()
    suite.initialize_components()

    if not suite.components_initialized:
        print("❌ Test suite initialization failed. Exiting.")
        return 1

    suite.run_all_fallback_tests()
    return 0

if __name__ == "__main__":
    sys.exit(main())
|
tests/test_retrieval.py
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Test suite for BasicRetrievalSystem
|
3 |
+
This module tests the core retrieval functionality including:
|
4 |
+
- System initialization
|
5 |
+
- Basic search functionality
|
6 |
+
- Deduplication logic
|
7 |
+
- Result formatting
|
8 |
+
"""
|
9 |
+
|
10 |
+
import sys
|
11 |
+
import os
|
12 |
+
from pathlib import Path
|
13 |
+
import logging
|
14 |
+
|
15 |
+
print("\n=== Phase 1: Initializing Test Environment ===")
|
16 |
+
# Add src to python path
|
17 |
+
current_dir = Path(__file__).parent.resolve()
|
18 |
+
project_root = current_dir.parent
|
19 |
+
sys.path.append(str(project_root / "src"))
|
20 |
+
|
21 |
+
print(f"• Current directory: {current_dir}")
|
22 |
+
print(f"• Project root: {project_root}")
|
23 |
+
print(f"• Python path added: {project_root / 'src'}")
|
24 |
+
|
25 |
+
# Change working directory to project root for file access
|
26 |
+
os.chdir(project_root)
|
27 |
+
print(f"• Changed working directory to: {project_root}")
|
28 |
+
|
29 |
+
from retrieval import BasicRetrievalSystem #type: ignore
|
30 |
+
|
31 |
+
class TestRetrievalSystem:
    """Test suite for basic retrieval system functionality.

    Designed to be driven manually by main() below; each test method prints
    a phase banner and raises AssertionError on failure.
    """

    def setup_class(self):
        """Initialize test environment: configure logging and build the system.

        NOTE(review): defined with `self` and invoked manually from main();
        under pytest, setup_class is conventionally a classmethod — confirm
        intended usage before running this file through pytest directly.
        """
        print("\n=== Phase 2: Setting up Test Environment ===")

        # Setup logging to capture our logs (console + test_retrieval.log
        # in the current working directory, which is project_root by now).
        logging.basicConfig(
            level=logging.INFO,
            format='%(levelname)s:%(name)s:%(message)s',
            handlers=[
                logging.StreamHandler(),
                logging.FileHandler('test_retrieval.log')
            ]
        )

        try:
            print("• Initializing BasicRetrievalSystem...")
            # 768 matches the embedding model's output dimension — assumed
            # from the argument name; verify against retrieval.py.
            self.retrieval = BasicRetrievalSystem(embedding_dim=768)
            print("✅ Retrieval system initialized successfully")

        except Exception as e:
            print(f"❌ Failed to initialize retrieval system: {e}")
            raise

    def test_system_initialization(self):
        """Assert that model, both indices, and both chunk stores loaded."""
        print("\n=== Phase 3: System Initialization Test ===")

        print("• Checking embedding model...")
        assert self.retrieval.embedding_model is not None, "Embedding model not loaded"
        print("✓ Embedding model loaded")

        print("• Checking emergency index...")
        assert self.retrieval.emergency_index is not None, "Emergency index not loaded"
        print("✓ Emergency index loaded")

        print("• Checking treatment index...")
        assert self.retrieval.treatment_index is not None, "Treatment index not loaded"
        print("✓ Treatment index loaded")

        print("• Checking chunk data...")
        assert len(self.retrieval.emergency_chunks) > 0, "Emergency chunks not loaded"
        assert len(self.retrieval.treatment_chunks) > 0, "Treatment chunks not loaded"
        print(f"✓ Emergency chunks: {len(self.retrieval.emergency_chunks)}")
        print(f"✓ Treatment chunks: {len(self.retrieval.treatment_chunks)}")

        print("✅ System initialization test passed")

    def test_basic_search_functionality(self):
        """Run three medical queries and validate result structure and fields.

        Each processed result must carry 'type', 'text', 'distance', and
        'chunk_id'; every result is printed for manual inspection.
        """
        print("\n=== Phase 4: Basic Search Functionality Test ===")

        test_queries = [
            "What is the treatment for acute myocardial infarction?",
            "How to manage chest pain in emergency?",
            "Acute stroke treatment protocol"
        ]

        for i, query in enumerate(test_queries, 1):
            print(f"\n🔍 Test Query {i}/3: {query}")

            try:
                results = self.retrieval.search(query)

                # Basic structure checks on the top-level response dict.
                assert "query" in results, "Query not in results"
                assert "processed_results" in results, "Processed results not found"
                assert "total_results" in results, "Total results count missing"

                processed_results = results["processed_results"]
                print(f"• Results returned: {len(processed_results)}")

                # Check result format and display ALL results
                for j, result in enumerate(processed_results, 1):  # Show ALL results
                    assert "type" in result, f"Result {j} missing 'type' field"
                    assert "text" in result, f"Result {j} missing 'text' field"
                    assert "distance" in result, f"Result {j} missing 'distance' field"
                    assert "chunk_id" in result, f"Result {j} missing 'chunk_id' field"

                    print(f" R-{j:2d} [{result['type']:9s}] (distance: {result['distance']:.3f}): {result['text'][:80]}...")

                print(f"✓ Query {i} completed successfully")

            except Exception as e:
                print(f"❌ Query {i} failed: {e}")
                raise

        print("\n✅ Basic search functionality test passed")

    def test_deduplication_logic(self):
        """Feed _remove_duplicates a list with one duplicated text and verify shrinkage.

        Only checks that the output is strictly smaller than the input — it
        does not pin which of the two duplicates survives.
        """
        print("\n=== Phase 5: Deduplication Logic Test ===")

        # Create test data with duplicate texts (chunk_ids differ on purpose:
        # dedup is expected to key on text, not chunk_id).
        test_results = [
            {"text": "Sample text 1", "distance": 0.1, "type": "emergency", "chunk_id": 1},
            {"text": "Sample text 1", "distance": 0.105, "type": "emergency", "chunk_id": 2},  # Duplicate text
            {"text": "Sample text 3", "distance": 0.2, "type": "treatment", "chunk_id": 3},
            {"text": "Sample text 4", "distance": 0.3, "type": "treatment", "chunk_id": 4}
        ]

        print(f"• Original results: {len(test_results)}")
        for i, result in enumerate(test_results, 1):
            print(f" Test-{i}: distance={result['distance']}, type={result['type']}")

        # Test deduplication
        unique_results = self.retrieval._remove_duplicates(test_results)

        print(f"• After deduplication: {len(unique_results)}")
        for i, result in enumerate(unique_results, 1):
            print(f" Kept-{i}: distance={result['distance']}, type={result['type']}")

        # Verify deduplication worked
        assert len(unique_results) < len(test_results), "Deduplication should remove duplicate texts"
        print("✓ Text-based deduplication working correctly")

        print("✅ Deduplication logic test passed")

    def test_result_statistics(self):
        """Verify a search response exposes total_results and processing_info."""
        print("\n=== Phase 6: Result Statistics Test ===")

        query = "Emergency cardiac arrest management"
        print(f"• Testing with query: {query}")

        # Capture logs by running search
        results = self.retrieval.search(query)

        # Verify we get statistics
        assert "total_results" in results, "Total results missing"
        assert "processing_info" in results, "Processing info missing"

        total_results = results["total_results"]
        duplicates_removed = results["processing_info"]["duplicates_removed"]

        print(f"• Total results: {total_results}")
        print(f"• Duplicates removed: {duplicates_removed}")
        print("✓ Statistics logging working correctly")

        print("✅ Result statistics test passed")
|
173 |
+
|
174 |
+
def main():
    """Drive every retrieval-system test phase in order, with banner output.

    Re-raises the first failure after printing a failure banner so the
    process exits non-zero.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("COMPREHENSIVE RETRIEVAL SYSTEM TEST SUITE")
    print(separator)

    suite = TestRetrievalSystem()

    try:
        suite.setup_class()
        suite.test_system_initialization()
        suite.test_basic_search_functionality()
        suite.test_deduplication_logic()
        suite.test_result_statistics()
    except Exception as e:
        print("\n" + separator)
        print("❌ RETRIEVAL SYSTEM TESTS FAILED!")
        print(f"Error: {str(e)}")
        print(separator)
        raise
    else:
        print("\n" + separator)
        print("🎉 ALL RETRIEVAL SYSTEM TESTS COMPLETED SUCCESSFULLY!")
        print(separator)
        print("✅ System initialization validated")
        print("✅ Basic search functionality confirmed")
        print("✅ Text-based deduplication working")
        print("✅ Result statistics and logging verified")
        print(separator)

if __name__ == "__main__":
    main()
|
tests/test_user_prompt.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
User Prompt Processor Test Suite
|
3 |
+
|
4 |
+
Comprehensive unit tests for UserPromptProcessor class
|
5 |
+
Ensures robust functionality across medical query scenarios.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import pytest
|
9 |
+
import sys
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
# Dynamically add project root to Python path
|
13 |
+
project_root = Path(__file__).parent.parent
|
14 |
+
sys.path.insert(0, str(project_root / "src"))
|
15 |
+
|
16 |
+
from user_prompt import UserPromptProcessor
|
17 |
+
|
18 |
+
class TestUserPromptProcessor:
    """Unit tests covering UserPromptProcessor's extraction and fallback paths."""

    def setup_method(self):
        """Create a fresh processor before every test method runs."""
        self.processor = UserPromptProcessor()

    def test_extract_condition_keywords_predefined(self):
        """A predefined-condition query yields all expected result fields."""
        extraction = self.processor.extract_condition_keywords("heart attack symptoms")

        assert extraction is not None
        for field in ('condition', 'emergency_keywords', 'treatment_keywords'):
            assert field in extraction

    def test_handle_matching_failure_level1(self):
        """Loose keyword matching fires with the expected type and confidence."""
        samples = (
            "urgent medical help",
            "critical condition",
            "severe symptoms",
        )

        for sample in samples:
            match = self.processor._handle_matching_failure_level1(sample)

            assert match is not None
            assert match['type'] == 'loose_keyword_match'
            assert match['confidence'] == 0.5

    def test_semantic_search_fallback(self):
        """Semantic fallback either returns None or a fully populated result."""
        samples = (
            "how to manage chest pain",
            "treatment for acute stroke",
            "emergency cardiac care",
        )

        for sample in samples:
            fallback = self.processor._semantic_search_fallback(sample)

            # A None result is acceptable: it means no semantic match was found.
            if fallback is not None:
                for field in ('condition', 'emergency_keywords', 'treatment_keywords'):
                    assert field in fallback

    def test_validate_keywords(self):
        """Non-empty keyword sets validate; all-empty ones are rejected."""
        populated = {
            'emergency_keywords': 'urgent|critical',
            'treatment_keywords': 'medication|therapy'
        }
        blank = {
            'emergency_keywords': '',
            'treatment_keywords': ''
        }

        assert self.processor.validate_keywords(populated) is True
        assert self.processor.validate_keywords(blank) is False
|
81 |
+
|
82 |
+
def main():
    """Print a banner, then hand this file to pytest for discovery and execution."""
    banner = "=" * 60
    print("\n" + banner)
    print("OnCall.ai: User Prompt Processor Test Suite")
    print(banner)

    # Run only this module's tests, verbosely, with short tracebacks.
    pytest.main([__file__, '-v', '--tb=short'])

if __name__ == "__main__":
    main()
|
tests/test_userinput_userprompt_medical_condition_llm_retrieval.py
ADDED
@@ -0,0 +1,479 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
"""
Comprehensive Test Suite for OnCall.ai Medical Query Processing Pipeline

This test validates the complete flow:
User Input → UserPrompt Processing → Medical Condition Extraction → LLM Analysis → Retrieval

Test Components:
- UserPromptProcessor (condition extraction, keyword mapping)
- MedicalConditions (predefined mappings, validation)
- LLM Client (Llama3-Med42-70B condition extraction)
- BasicRetrievalSystem (vector search, result processing)

Author: OnCall.ai Team
Date: 2025-07-30
"""

import sys
import os
from pathlib import Path
import logging
import json
import traceback
from datetime import datetime
from typing import Dict, List, Any

# Add src directory to Python path
current_dir = Path(__file__).parent
project_root = current_dir.parent
src_dir = project_root / "src"
sys.path.insert(0, str(src_dir))

# Import our modules
# NOTE: a failed import terminates the process with exit code 1 after
# printing diagnostics — this file is meant to be run as a script.
try:
    from user_prompt import UserPromptProcessor
    from retrieval import BasicRetrievalSystem
    from llm_clients import llm_Med42_70BClient
    from medical_conditions import CONDITION_KEYWORD_MAPPING, validate_condition, get_condition_details
except ImportError as e:
    print(f"❌ Import Error: {e}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"Python path: {sys.path}")
    sys.exit(1)

# Configure comprehensive logging: console plus tests/pipeline_test.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(project_root / 'tests' / 'pipeline_test.log')
    ]
)
logger = logging.getLogger(__name__)
|
55 |
+
|
56 |
+
class MedicalQueryPipelineTest:
|
57 |
+
"""Comprehensive test suite for the medical query processing pipeline"""
|
58 |
+
|
59 |
+
def __init__(self):
|
60 |
+
"""Initialize test suite with all required components"""
|
61 |
+
self.start_time = datetime.now()
|
62 |
+
self.results = []
|
63 |
+
self.components_initialized = False
|
64 |
+
|
65 |
+
# Component references
|
66 |
+
self.llm_client = None
|
67 |
+
self.retrieval_system = None
|
68 |
+
self.user_prompt_processor = None
|
69 |
+
|
70 |
+
def initialize_components(self):
    """Initialize all pipeline components with error handling.

    Brings up the LLM client, the retrieval system, and the prompt
    processor in that order (the processor depends on the first two).
    On any failure, logs the error, prints a traceback, and leaves
    components_initialized False instead of raising.
    """
    print("🔧 Initializing Pipeline Components...")
    print("-" * 50)

    try:
        # Initialize LLM client
        print("1. Initializing Llama3-Med42-70B Client...")
        self.llm_client = llm_Med42_70BClient()
        print(" ✅ LLM client initialized successfully")

        # Initialize retrieval system
        print("2. Initializing Retrieval System...")
        self.retrieval_system = BasicRetrievalSystem()
        print(" ✅ Retrieval system initialized successfully")

        # Initialize user prompt processor (wired to the two components above)
        print("3. Initializing User Prompt Processor...")
        self.user_prompt_processor = UserPromptProcessor(
            llm_client=self.llm_client,
            retrieval_system=self.retrieval_system
        )
        print(" ✅ User prompt processor initialized successfully")

        self.components_initialized = True
        print("\n🎉 All components initialized successfully!")

    except Exception as e:
        logger.error(f"Component initialization failed: {e}")
        print(f"❌ Component initialization failed: {e}")
        traceback.print_exc()
        self.components_initialized = False
|
102 |
+
|
103 |
+
def get_test_queries(self) -> List[Dict[str, Any]]:
    """Define comprehensive test queries with expected behavior.

    Each case dict carries:
      id                  -- stable test identifier
      query               -- raw user input fed to the pipeline
      description         -- human-readable intent of the case
      expected_condition  -- str, or list of acceptable condition strings
      expected_mechanism  -- which extraction path should fire
                             (predefined_mapping / llm_extraction / semantic_fallback)
      category            -- grouping label used in the report
    """
    return [
        {
            "id": "test_001",
            "query": "how to treat acute MI?",
            "description": "Classic acute myocardial infarction query",
            "expected_condition": "acute myocardial infarction",
            "expected_mechanism": "predefined_mapping",
            "category": "cardiac_emergency"
        },
        {
            "id": "test_002",
            "query": "patient with severe chest pain and shortness of breath",
            "description": "Symptoms-based query requiring LLM analysis",
            # Any of these diagnoses counts as a match for this case.
            "expected_condition": ["acute myocardial infarction", "pulmonary embolism", "acute coronary syndrome"],
            "expected_mechanism": "llm_extraction",
            "category": "cardiac_pulmonary"
        },
        {
            "id": "test_003",
            "query": "sudden neurological symptoms suggesting stroke",
            "description": "Neurological emergency query",
            "expected_condition": "acute stroke",
            "expected_mechanism": "predefined_mapping",
            "category": "neurological_emergency"
        },
        {
            "id": "test_004",
            "query": "acute stroke management protocol",
            "description": "Protocol-specific stroke query",
            "expected_condition": "acute stroke",
            "expected_mechanism": "predefined_mapping",
            "category": "neurological_protocol"
        },
        {
            "id": "test_005",
            "query": "patient presenting with acute abdominal pain",
            "description": "General symptom requiring LLM analysis",
            # "unknown" marks a case with no single expected diagnosis.
            "expected_condition": "unknown",
            "expected_mechanism": "semantic_fallback",
            "category": "general_symptom"
        },
        {
            "id": "test_006",
            "query": "pulmonary embolism treatment guidelines",
            "description": "Specific condition with treatment focus",
            "expected_condition": "pulmonary embolism",
            "expected_mechanism": "predefined_mapping",
            "category": "pulmonary_emergency"
        }
    ]
|
155 |
+
|
156 |
+
def run_single_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
    """Execute a single test case with comprehensive analysis.

    Runs three pipeline steps — condition extraction, (simulated) user
    confirmation, and retrieval — recording per-step timing and outcome
    under result["steps"]. Never raises: any exception is captured into
    result["error"]/result["traceback"] and success stays False.
    """
    test_id = test_case["id"]
    query = test_case["query"]

    print(f"\n🔍 {test_id}: {test_case['description']}")
    print(f"Query: '{query}'")
    print("-" * 60)

    # Result skeleton; "steps" is filled incrementally below.
    result = {
        "test_id": test_id,
        "test_case": test_case,
        "timestamp": datetime.now().isoformat(),
        "success": False,
        "error": None,
        "execution_time": 0,
        "steps": {}
    }

    start_time = datetime.now()

    try:
        # Step 1: Condition Extraction
        print("Step 1: Extracting medical condition and keywords...")
        condition_start = datetime.now()

        condition_result = self.user_prompt_processor.extract_condition_keywords(query)
        condition_time = (datetime.now() - condition_start).total_seconds()

        result["steps"]["condition_extraction"] = {
            "duration_seconds": condition_time,
            "condition": condition_result.get('condition', ''),
            "emergency_keywords": condition_result.get('emergency_keywords', ''),
            "treatment_keywords": condition_result.get('treatment_keywords', ''),
            "confidence": condition_result.get('confidence', 'unknown'),
            "source": self._determine_extraction_source(condition_result)
        }

        print(f" Condition: {condition_result.get('condition', 'None')}")
        print(f" Emergency keywords: {condition_result.get('emergency_keywords', 'None')}")
        print(f" Treatment keywords: {condition_result.get('treatment_keywords', 'None')}")
        print(f" Source: {result['steps']['condition_extraction']['source']}")
        print(f" Duration: {condition_time:.3f}s")

        # Step 2: User Confirmation (Simulated)
        print("\nStep 2: User confirmation process...")
        confirmation_result = self.user_prompt_processor.handle_user_confirmation(condition_result)

        result["steps"]["user_confirmation"] = {
            "confirmation_type": confirmation_result.get('type', 'unknown'),
            "message_length": len(confirmation_result.get('message', '')),
            "actionable": confirmation_result.get('type') == 'confirmation_needed'
        }

        print(f" Confirmation type: {confirmation_result.get('type', 'Unknown')}")

        # Step 3: Retrieval Execution — only when a condition was extracted.
        if condition_result.get('condition'):
            print("\nStep 3: Executing retrieval...")
            retrieval_start = datetime.now()

            # Construct search query
            search_query = self._construct_search_query(condition_result)

            # Perform retrieval
            retrieval_results = self.retrieval_system.search(search_query, top_k=5)
            retrieval_time = (datetime.now() - retrieval_start).total_seconds()

            # Correctly count emergency and treatment results from processed_results
            processed_results = retrieval_results.get('processed_results', [])
            emergency_count = len([r for r in processed_results if r.get('type') == 'emergency'])
            treatment_count = len([r for r in processed_results if r.get('type') == 'treatment'])

            result["steps"]["retrieval"] = {
                "duration_seconds": retrieval_time,
                "search_query": search_query,
                "total_results": retrieval_results.get('total_results', 0),
                "emergency_results": emergency_count,
                "treatment_results": treatment_count,
                "processed_results": len(processed_results),
                "duplicates_removed": retrieval_results.get('processing_info', {}).get('duplicates_removed', 0)
            }

            print(f" Search query: '{search_query}'")
            print(f" Total results: {result['steps']['retrieval']['total_results']}")
            print(f" Emergency results: {emergency_count}")
            print(f" Treatment results: {treatment_count}")
            print(f" Duration: {retrieval_time:.3f}s")

            # Analyze top results (first three, in returned order)
            if 'processed_results' in retrieval_results and retrieval_results['processed_results']:
                top_results = retrieval_results['processed_results'][:3]
                result["steps"]["top_results_analysis"] = []

                print(f"\n Top {len(top_results)} results:")
                for i, res in enumerate(top_results, 1):
                    # 999 is a sentinel for "distance missing" so sorting/printing never fails.
                    analysis = {
                        "rank": i,
                        "type": res.get('type', 'unknown'),
                        "distance": res.get('distance', 999),
                        "text_length": len(res.get('text', '')),
                        "has_matched_keywords": bool(res.get('matched', '')),
                        "has_treatment_keywords": bool(res.get('matched_treatment', ''))
                    }
                    result["steps"]["top_results_analysis"].append(analysis)

                    print(f" {i}. Type: {analysis['type']}, Distance: {analysis['distance']:.4f}")
                    print(f" Text preview: {res.get('text', '')[:100]}...")
                    if res.get('matched'):
                        print(f" Matched: {res.get('matched')}")
                    if res.get('matched_treatment'):
                        print(f" Treatment: {res.get('matched_treatment')}")

        else:
            print("\nStep 3: Skipping retrieval (no condition extracted)")
            result["steps"]["retrieval"] = {
                "skipped": True,
                "reason": "no_condition_extracted"
            }

        # Calculate total execution time
        total_time = (datetime.now() - start_time).total_seconds()
        result["execution_time"] = total_time
        result["success"] = True

        print(f"\n✅ Test {test_id} completed successfully ({total_time:.3f}s)")

    except Exception as e:
        total_time = (datetime.now() - start_time).total_seconds()
        result["execution_time"] = total_time
        result["error"] = str(e)
        result["traceback"] = traceback.format_exc()

        logger.error(f"Test {test_id} failed: {e}")
        print(f"❌ Test {test_id} failed: {e}")

    return result
|
293 |
+
|
294 |
+
def _determine_extraction_source(self, condition_result: Dict) -> str:
|
295 |
+
"""Determine how the condition was extracted"""
|
296 |
+
if condition_result.get('semantic_confidence') is not None:
|
297 |
+
return "semantic_search"
|
298 |
+
elif condition_result.get('generic_confidence') is not None:
|
299 |
+
return "generic_search"
|
300 |
+
elif condition_result.get('condition') in CONDITION_KEYWORD_MAPPING:
|
301 |
+
return "predefined_mapping"
|
302 |
+
else:
|
303 |
+
return "llm_extraction"
|
304 |
+
|
305 |
+
def _construct_search_query(self, condition_result: Dict) -> str:
|
306 |
+
"""Construct search query from condition result"""
|
307 |
+
emergency_kws = condition_result.get('emergency_keywords', '')
|
308 |
+
treatment_kws = condition_result.get('treatment_keywords', '')
|
309 |
+
|
310 |
+
search_parts = []
|
311 |
+
if emergency_kws:
|
312 |
+
search_parts.append(emergency_kws)
|
313 |
+
if treatment_kws:
|
314 |
+
search_parts.append(treatment_kws)
|
315 |
+
|
316 |
+
if search_parts:
|
317 |
+
return ' '.join(search_parts)
|
318 |
+
else:
|
319 |
+
return condition_result.get('condition', 'medical emergency')
|
320 |
+
|
321 |
+
def run_all_tests(self):
    """Execute every test case, then emit the report and persist results."""
    if not self.components_initialized:
        print("❌ Cannot run tests: components not initialized")
        return

    test_cases = self.get_test_queries()

    # Banner before the run starts.
    print(f"\n🚀 Starting Comprehensive Pipeline Test")
    print(f"Total test cases: {len(test_cases)}")
    print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 80)

    # Run each case in order, accumulating results on the instance.
    self.results.extend(self.run_single_test(case) for case in test_cases)

    # Summarize and save once all cases have executed.
    self.generate_test_report()
    self.save_test_results()
|
342 |
+
|
343 |
+
def generate_test_report(self):
    """Generate a detailed test report with statistics and analysis.

    Prints an execution summary (timings, success rate), a breakdown of
    condition-extraction sources and average stage timings for successful
    tests, per-test detail lines, and error details for failed tests.

    Fix: the per-test average and success-rate computations previously
    divided by len(self.results) unconditionally, raising
    ZeroDivisionError when no tests had been run; both are now guarded.
    """
    end_time = datetime.now()
    total_duration = (end_time - self.start_time).total_seconds()

    successful_tests = [r for r in self.results if r['success']]
    failed_tests = [r for r in self.results if not r['success']]

    print("\n" + "=" * 80)
    print("📊 COMPREHENSIVE TEST REPORT")
    print("=" * 80)

    # Summary Statistics
    print(f"🕐 Execution Summary:")
    print(f" Start time: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Total duration: {total_duration:.3f}s")
    if self.results:  # guard: no tests run -> avoid division by zero
        print(f" Average per test: {total_duration/len(self.results):.3f}s")

    print(f"\n📈 Test Results:")
    print(f" Total tests: {len(self.results)}")
    print(f" Successful: {len(successful_tests)} ✅")
    print(f" Failed: {len(failed_tests)} ❌")
    if self.results:  # guard: success rate undefined for zero tests
        print(f" Success rate: {len(successful_tests)/len(self.results)*100:.1f}%")

    # Detailed Analysis
    if successful_tests:
        print(f"\n✅ Successful Tests Analysis:")

        # Tally extraction sources and accumulate stage timings.
        source_counts = {}
        total_retrieval_time = 0
        total_condition_time = 0
        retrieval_count = 0

        for result in successful_tests:
            if 'condition_extraction' in result['steps']:
                source = result['steps']['condition_extraction']['source']
                source_counts[source] = source_counts.get(source, 0) + 1
                total_condition_time += result['steps']['condition_extraction']['duration_seconds']

            if 'retrieval' in result['steps'] and not result['steps']['retrieval'].get('skipped'):
                total_retrieval_time += result['steps']['retrieval']['duration_seconds']
                retrieval_count += 1

        print(f" Condition extraction sources:")
        for source, count in source_counts.items():
            print(f" - {source}: {count} tests")

        print(f" Performance metrics:")
        print(f" - Avg condition extraction: {total_condition_time/len(successful_tests):.3f}s")
        if retrieval_count > 0:
            print(f" - Avg retrieval time: {total_retrieval_time/retrieval_count:.3f}s")

        # Individual test details
        for result in successful_tests:
            test_case = result['test_case']
            print(f"\n 📋 {result['test_id']}: {test_case['description']}")
            print(f" Query: '{test_case['query']}'")

            if 'condition_extraction' in result['steps']:
                ce = result['steps']['condition_extraction']
                print(f" Condition: {ce['condition']}")
                print(f" Source: {ce['source']}")

            if 'retrieval' in result['steps'] and not result['steps']['retrieval'].get('skipped'):
                ret = result['steps']['retrieval']
                print(f" Results: {ret['total_results']} total ({ret['emergency_results']} emergency, {ret['treatment_results']} treatment)")

            print(f" Duration: {result['execution_time']:.3f}s")

    # Failed Tests Analysis
    if failed_tests:
        print(f"\n❌ Failed Tests Analysis:")
        for result in failed_tests:
            test_case = result['test_case']
            print(f" {result['test_id']}: {test_case['description']}")
            print(f" Error: {result['error']}")
            print(f" Duration: {result['execution_time']:.3f}s")

    print("\n" + "=" * 80)
|
424 |
+
|
425 |
+
def save_test_results(self):
    """Save detailed test results to a timestamped JSON file.

    Writes test metadata (timings and pass/fail counts), the full list
    of per-test results, and component version strings to
    tests/pipeline_test_results_<timestamp>.json under the project
    root. Write failures are logged and reported, never raised.

    Fix: the success message previously printed the literal placeholder
    '(unknown)' instead of the actual output path; it now interpolates
    `filename`.
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = project_root / 'tests' / f'pipeline_test_results_{timestamp}.json'

    try:
        comprehensive_results = {
            "test_metadata": {
                "timestamp": datetime.now().isoformat(),
                "start_time": self.start_time.isoformat(),
                "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
                "total_tests": len(self.results),
                "successful_tests": len([r for r in self.results if r['success']]),
                "failed_tests": len([r for r in self.results if not r['success']])
            },
            "test_results": self.results,
            # NOTE(review): versions are hard-coded; confirm they track the
            # actual component releases.
            "component_versions": {
                "user_prompt_processor": "1.0.0",
                "retrieval_system": "1.0.0",
                "llm_client": "1.0.0"
            }
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)

        print(f"📁 Comprehensive test results saved to: {filename}")

    except Exception as e:
        logger.error(f"Failed to save test results: {e}")
        print(f"⚠️ Failed to save test results: {e}")
|
456 |
+
|
457 |
+
def main():
    """Main execution function"""
    print("🏥 OnCall.ai Medical Query Processing Pipeline Test")
    print("=" * 60)

    # Build the suite and bring up its components; abort on failure.
    suite = MedicalQueryPipelineTest()
    suite.initialize_components()
    if not suite.components_initialized:
        print("❌ Test suite initialization failed. Exiting.")
        return 1

    # Components are ready: execute the full test run.
    suite.run_all_tests()
    return 0
|
476 |
+
|
477 |
+
if __name__ == "__main__":
    # Propagate main()'s status code to the shell.
    sys.exit(main())
|