Yan-Bo Chen committed on
Commit 5888ce4 · 2 Parent(s): b85d6ac c5edc14

Merge pull request #3 from YanBoChen0928/generation


Generation: complete pipeline from user_input → user_prompt (llm_clients, medical_conditions) → retrieval.py → generation.py.
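As a rough orientation, here is a minimal end-to-end sketch of the pipeline this merge describes. It is illustrative only: the import paths assume the project root (or `src/`) is on the Python path, `UserPromptProcessor` is omitted because its interface is not shown in this diff, and the calls follow the signatures visible in `src/retrieval.py` and `src/generation.py` below.

```python
from src.retrieval import BasicRetrievalSystem
from src.generation import MedicalAdviceGenerator

user_query = "how to treat acute myocardial infarction?"

retrieval_system = BasicRetrievalSystem()            # loads embeddings and Annoy indices from models/
retrieval_results = retrieval_system.search(user_query, top_k=5)

generator = MedicalAdviceGenerator()                 # wraps the Med42-70B client; requires HF_TOKEN
advice = generator.generate_medical_advice(
    user_query=user_query,
    retrieval_results=retrieval_results,
    intention="treatment",
)
print(advice["medical_advice"])
```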

.env.example ADDED
@@ -0,0 +1,2 @@
+ # .env.example document
+ HF_TOKEN=your_huggingface_token_here
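
For reference, a minimal sketch of how this token is consumed, mirroring the `load_dotenv()` / `os.getenv("HF_TOKEN")` pattern used in `src/llm_clients.py` below:

```python
# Minimal sketch: read HF_TOKEN from a local .env file
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN not found in environment variables.")
```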
.gitignore CHANGED
@@ -1,17 +1,27 @@
  # 🧠 Virtual environments
  genAIvenv/
  .final_project_env/
+ .env
+ .venv
+ env/
+ venv/

  # 💻 OS / Editor garbage
  .DS_Store
  .vscode/
+ *.swp
+ *~
+ .idea/
+ *.iml

  # 📁 Documentation and project folders
  docs/
  dataset/dataset/
+ cache/

  # 🧾 Compiled / output files
  *.pyc
+ __pycache__/
  *.log
  *.zip
  *.tar.gz
@@ -20,6 +30,15 @@ dataset/dataset/
  *.json
  *.png

+ # 🔑 Secrets and configs
+ .env
+ .env.local
+ .env.*.local
+ *.pem
+ credentials.json
+ token.json
+ *.mdc
+
  # 🚫 Large files - models
  models/cache/
  models/cache/*.pkl
@@ -32,3 +51,15 @@ models/indices/annoy/*.ann
  *.pkl
  *.npy
  *.ann
+
+ # 📊 Jupyter Notebook
+ .ipynb_checkpoints
+ */.ipynb_checkpoints/*
+ *.ipynb_checkpoints*
+
+ # 📝 Coverage reports
+ htmlcov/
+ .coverage
+ .coverage.*
+ coverage.xml
+ *.cover
requirements.txt CHANGED
@@ -32,13 +32,16 @@ huggingface-hub==0.33.4
  idna==3.10
  Jinja2==3.1.6
  jiter==0.10.0
+ joblib==1.5.1
  kiwisolver==1.4.8
  markdown-it-py==3.0.0
  MarkupSafe==3.0.2
  matplotlib==3.10.3
  mdurl==0.1.2
+ mpmath==1.3.0
  multidict==6.6.3
  multiprocess==0.70.16
+ networkx==3.5
  numpy==2.3.1
  openai==1.97.0
  orjson==3.11.0
@@ -53,6 +56,7 @@ pydub==0.25.1
  Pygments==2.19.2
  pyparsing==3.2.3
  python-dateutil==2.9.0.post0
+ python-dotenv==1.1.1
  python-multipart==0.0.20
  pytz==2025.2
  PyYAML==6.0.2
@@ -62,6 +66,8 @@ rich==14.0.0
  ruff==0.12.4
  safehttpx==0.1.6
  safetensors==0.5.3
+ scikit-learn==1.7.1
+ scipy==1.16.1
  seaborn==0.13.2
  semantic-version==2.10.0
  sentence-transformers==3.0.1
@@ -69,8 +75,11 @@ shellingham==1.5.4
  six==1.17.0
  sniffio==1.3.1
  starlette==0.47.2
+ sympy==1.14.0
+ threadpoolctl==3.6.0
  tokenizers==0.21.2
  tomlkit==0.13.3
+ torch==2.7.1
  tqdm==4.67.1
  transformers==4.53.2
  typer==0.16.0
src/Todo_20250731_Multlevel_Fallback_Fix.md ADDED
@@ -0,0 +1,83 @@
+ Great, let's analyze these detailed test results together. This data is very valuable!
+
+ ## 📊 **Deep Analysis of the Test Results**
+
+ ### **✅ What Worked (53.8% success rate)**
+
+ **Level 1 - Predefined mapping**: 💯 **Perfect performance**
+ - 3/3 tests passed, average 0.000s (instant response)
+ - `acute myocardial infarction`, `acute stroke`, `pulmonary embolism` all hit directly
+
+ **Level 4b→5 - Rare medical queries**: 💯 **Perfect performance**
+ - 3/3 tests passed, correctly routed to generic search
+ - Rare blood disorders, ITP, and necrotizing fasciitis were all handled correctly
+
+ ### **🔍 Key Issues Found**
+
+ #### **Issue 1: Level 4 validation is not working** ❌
+ **Symptom**: non-medical queries (cooking, programming, weather) are all treated as medical queries
+ ```
+ - "how to cook pasta properly?" → Level 5 (should be rejected)
+ - "programming language" → Level 5 (should be rejected)
+ - "weather forecast" → Level 5 (should be rejected)
+ ```
+
+ **Root cause**: the `validate_medical_query` logic is flawed
+ - Even when the LLM says "this is not a medical query", the function still returns `None` (meaning the query passed validation)
+ - It should check whether the LLM response explicitly states the query is non-medical
+
+ #### **Issue 2: Level 3 semantic search logic problem** ⚠️
+ **Symptom**: queries expected to stop at Level 3 all fall through to Level 5
+ ```
+ - "emergency management of cardiovascular crisis" → Level 5 (expected Level 3)
+ - "urgent neurological intervention protocols" → Level 5 (expected Level 3)
+ ```
+
+ **Cause**: the `_infer_condition_from_text` method may be too strict and fails to infer a valid condition
+
+ #### **Issue 3: Inconsistent Level 2 behavior** ⚠️
+ **Symptom**:
+ - `level2_001` succeeded, but was intercepted by Level 1 (the LLM extracted a known condition)
+ - `level2_002` failed: the LLM extracted a condition, but validation failed
+
+ ## 🛠️ **Fix Priorities**
+
+ ### **Priority 1: Fix validate_medical_query**
+ ```python
+ def validate_medical_query(self, user_query: str) -> Optional[Dict[str, Any]]:
+     # Check whether the LLM response explicitly flags the query as non-medical
+     if llama_result.get('extracted_condition'):
+         response_text = llama_result.get('raw_response', '').lower()
+
+         # Check for an explicit rejection of the medical query
+         rejection_phrases = [
+             "not a medical condition",
+             "outside my medical scope",
+             "unrelated to medical conditions",
+             "do not address"
+         ]
+
+         if any(phrase in response_text for phrase in rejection_phrases):
+             return self._generate_invalid_query_response()
+
+     return None  # Validation passed
+ ```
+
+ ### **Priority 2: Improve condition inference for semantic search**
+ The similarity threshold in `_infer_condition_from_text` may be too high (0.7); consider lowering it to 0.5
+
+ ### **Priority 3: Optimize validation of Level 2 LLM extraction**
+ Make sure `validate_condition` handles complex LLM responses correctly
+
+ ## 🎯 **Overall Assessment**
+
+ ### **Speed**: ⭐⭐⭐⭐⭐
+ - Level 1: instant response (0.000s)
+ - Average: 14.4s (mostly due to LLM calls)
+
+ ### **Accuracy**: ⭐⭐⭐
+ - Predefined conditions: 100% accurate
+ - Rare medical queries: 100% accurate
+ - Non-medical rejection: 0% accurate ← **needs an immediate fix**
+
+ Would you like me to fix the `validate_medical_query` logic first? It is the most critical issue; once resolved, the overall success rate should rise to 80%+.
src/__init__.py CHANGED
@@ -3,6 +3,16 @@ OnCall.ai src package

  This package contains the core implementation of the OnCall.ai system.
  """
-
+
  # Version
- __version__ = '0.1.0'
+ __version__ = '0.1.0'
+
+ # import key modules
+ from .llm_clients import llm_Med42_70BClient
+ from .user_prompt import UserPromptProcessor
+ from .retrieval import BasicRetrievalSystem
+ from .medical_conditions import (
+     CONDITION_KEYWORD_MAPPING,
+     get_condition_keywords,
+     validate_condition
+ )
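
A short usage sketch of these package-level re-exports (assuming the project root is on `PYTHONPATH` so `src` is importable):

```python
# Usage sketch only; the import path is an assumption about how the package is installed
from src import (
    llm_Med42_70BClient,
    UserPromptProcessor,
    BasicRetrievalSystem,
    validate_condition,
)

print(validate_condition("acute stroke"))  # True: listed in CONDITION_KEYWORD_MAPPING
```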
src/data_processing.py CHANGED
@@ -106,7 +106,7 @@ class DataProcessor:
106
  raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
107
 
108
  # Load data
109
- self.emergency_data = pd.read_json(str(emergency_path), lines=True) # 使用 str() 确保路径正确处理
110
  self.treatment_data = pd.read_json(str(treatment_path), lines=True)
111
 
112
  logger.info(f"Loaded {len(self.emergency_data)} emergency records")
@@ -167,11 +167,8 @@ class DataProcessor:
167
  # Get the keyword text (already lowercase)
168
  actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
169
 
170
- # Calculate rough window size using dynamic ratio
171
- # Cap the rough chunk target token size to prevent tokenizer warnings
172
- # Use 512 tokens as target (model's max limit)
173
- ROUGH_CHUNK_TARGET_TOKENS = 512
174
- char_window = int(ROUGH_CHUNK_TARGET_TOKENS * chars_per_token / 2)
175
 
176
  # Get rough chunk boundaries in characters
177
  rough_start = max(0, keyword_pos - char_window)
@@ -231,73 +228,119 @@ class DataProcessor:
231
  return chunks
232
 
233
  def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
234
- treatment_keywords: str, chunk_size: int = 512,
235
  doc_id: str = None) -> List[Dict[str, Any]]:
236
  """
237
  Create chunks for treatment data with both emergency and treatment keywords
 
238
 
239
  Args:
240
  text: Input text
241
- emergency_keywords: Emergency keywords
242
- treatment_keywords: Treatment keywords
243
- chunk_size: Size of each chunk
244
  doc_id: Document ID for tracking
245
 
246
  Returns:
247
- List of chunk dictionaries
248
  """
249
- if not treatment_keywords or pd.isna(treatment_keywords):
250
- return []
251
-
252
  chunks = []
253
- em_keywords = emergency_keywords.split("|") if emergency_keywords else []
254
- tr_keywords = treatment_keywords.split("|") if treatment_keywords else []
255
 
256
- # Process treatment keywords as primary (since this is treatment-focused data)
257
- for i, tr_keyword in enumerate(tr_keywords):
258
- tr_pos = text.lower().find(tr_keyword.lower())
259
 
260
- if tr_pos != -1:
261
- # Find closest emergency keyword for context
262
- closest_em_keyword = None
263
- closest_distance = float('inf')
 
 
 
264
 
265
- for em_keyword in em_keywords:
266
- em_pos = text.lower().find(em_keyword.lower())
267
- if em_pos != -1:
268
- distance = abs(tr_pos - em_pos)
269
- if distance < closest_distance and distance < chunk_size:
270
- closest_distance = distance
271
- closest_em_keyword = em_keyword
272
 
273
- # Calculate chunk boundaries
274
- if closest_em_keyword:
275
- # Center between both keywords
276
- em_pos = text.lower().find(closest_em_keyword.lower())
277
- center = (tr_pos + em_pos) // 2
278
- else:
279
- # Center on treatment keyword
280
- center = tr_pos
281
 
282
- start = max(0, center - chunk_size // 2)
283
- end = min(len(text), center + chunk_size // 2)
284
-
285
- chunk_text = text[start:end].strip()
 
 
 
 
286
 
287
- if chunk_text:
288
- chunk_info = {
289
- "text": chunk_text,
290
- "primary_keyword": tr_keyword,
291
- "emergency_keywords": emergency_keywords,
292
- "treatment_keywords": treatment_keywords,
293
- "closest_emergency_keyword": closest_em_keyword,
294
- "keyword_distance": closest_distance if closest_em_keyword else None,
295
- "chunk_start": start,
296
- "chunk_end": end,
297
- "chunk_id": f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}",
298
- "source_doc_id": doc_id
299
- }
300
- chunks.append(chunk_info)
301
 
302
  return chunks
303
 
@@ -308,12 +351,14 @@ class DataProcessor:
308
 
309
  all_chunks = []
310
 
311
- # Add progress bar with leave=False to avoid cluttering
312
  for idx, row in tqdm(self.emergency_data.iterrows(),
313
  total=len(self.emergency_data),
314
- desc="Processing emergency documents",
315
- unit="doc",
316
- leave=False):
 
 
317
  if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
318
  chunks = self.create_keyword_centered_chunks(
319
  text=row['clean_text'],
@@ -345,12 +390,14 @@ class DataProcessor:
345
 
346
  all_chunks = []
347
 
348
- # Add progress bar with leave=False to avoid cluttering
349
  for idx, row in tqdm(self.treatment_data.iterrows(),
350
  total=len(self.treatment_data),
351
- desc="Processing treatment documents",
352
- unit="doc",
353
- leave=False):
 
 
354
  if (pd.notna(row.get('clean_text')) and
355
  pd.notna(row.get('treatment_matched'))):
356
 
@@ -454,10 +501,12 @@ class DataProcessor:
454
  logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
455
 
456
  for i in tqdm(range(0, len(texts), batch_size),
457
- desc=f"Embedding {chunk_type} subset",
458
  total=total_batches,
459
- unit="batch",
460
- leave=False):
 
 
461
  batch_texts = texts[i:i + batch_size]
462
  batch_emb = model.encode(
463
  batch_texts,
 
106
  raise FileNotFoundError(f"Treatment data not found: {treatment_path}")
107
 
108
  # Load data
109
+ self.emergency_data = pd.read_json(str(emergency_path), lines=True) # use str() to ensure path is correct
110
  self.treatment_data = pd.read_json(str(treatment_path), lines=True)
111
 
112
  logger.info(f"Loaded {len(self.emergency_data)} emergency records")
 
167
  # Get the keyword text (already lowercase)
168
  actual_keyword = text[keyword_pos:keyword_pos + len(keyword)]
169
 
170
+ # Calculate rough window size using simple ratio
171
+ char_window = int(chunk_size * chars_per_token / 2)
 
 
 
172
 
173
  # Get rough chunk boundaries in characters
174
  rough_start = max(0, keyword_pos - char_window)
 
228
  return chunks
229
 
230
  def create_dual_keyword_chunks(self, text: str, emergency_keywords: str,
231
+ treatment_keywords: str, chunk_size: int = None,
232
  doc_id: str = None) -> List[Dict[str, Any]]:
233
  """
234
  Create chunks for treatment data with both emergency and treatment keywords
235
+ using token-based separate chunking strategy with enhanced metadata for treatment chunks
236
 
237
  Args:
238
  text: Input text
239
+ emergency_keywords: Emergency keywords (pipe-separated)
240
+ treatment_keywords: Treatment keywords (pipe-separated)
241
+ chunk_size: Size of each chunk in tokens (defaults to self.chunk_size)
242
  doc_id: Document ID for tracking
243
 
244
  Returns:
245
+ List of chunk dictionaries with enhanced metadata for treatment chunks
246
  """
 
 
 
247
  chunks = []
248
+ chunk_size = chunk_size or self.chunk_size
 
249
 
250
+ # Case 1: No keywords present
251
+ if not emergency_keywords and not treatment_keywords:
252
+ return []
253
+
254
+ # Case 2: Only emergency keywords (early return)
255
+ if emergency_keywords and not treatment_keywords:
256
+ em_chunks = self.create_keyword_centered_chunks(
257
+ text=text,
258
+ matched_keywords=emergency_keywords,
259
+ chunk_size=chunk_size,
260
+ doc_id=doc_id
261
+ )
262
+ for chunk in em_chunks:
263
+ chunk['source_type'] = 'emergency'
264
+ return em_chunks
265
+
266
+ # Case 3: Only treatment keywords (early return)
267
+ if treatment_keywords and not emergency_keywords:
268
+ tr_chunks = self.create_keyword_centered_chunks(
269
+ text=text,
270
+ matched_keywords=treatment_keywords,
271
+ chunk_size=chunk_size,
272
+ doc_id=doc_id
273
+ )
274
+ for chunk in tr_chunks:
275
+ chunk['source_type'] = 'treatment'
276
+ chunk['contains_treatment_kws'] = treatment_keywords.split('|')
277
+ chunk['contains_emergency_kws'] = []
278
+ chunk['match_type'] = 'treatment_only'
279
+ return tr_chunks
280
+
281
+ # Case 4: Both keywords present - separate processing
282
+ # Process emergency keywords
283
+ if emergency_keywords:
284
+ em_chunks = self.create_keyword_centered_chunks(
285
+ text=text,
286
+ matched_keywords=emergency_keywords,
287
+ chunk_size=chunk_size,
288
+ doc_id=doc_id
289
+ )
290
+ for chunk in em_chunks:
291
+ chunk['source_type'] = 'emergency'
292
+ chunks.extend(em_chunks)
293
+
294
+ # Process treatment keywords
295
+ if treatment_keywords:
296
+ tr_chunks = self.create_keyword_centered_chunks(
297
+ text=text,
298
+ matched_keywords=treatment_keywords,
299
+ chunk_size=chunk_size,
300
+ doc_id=doc_id
301
+ )
302
 
303
+ # Parse keywords for metadata
304
+ em_kws = emergency_keywords.split('|') if emergency_keywords else []
305
+ tr_kws = treatment_keywords.split('|') if treatment_keywords else []
306
+
307
+ # Add metadata for each treatment chunk
308
+ for i, chunk in enumerate(tr_chunks):
309
+ chunk_text = chunk['text'].lower()
310
 
311
+ # Check for keyword presence in chunk text
312
+ contains_emergency_kws = [
313
+ kw for kw in em_kws if kw.lower() in chunk_text
314
+ ]
315
+ contains_treatment_kws = [
316
+ kw for kw in tr_kws if kw.lower() in chunk_text
317
+ ]
318
 
319
+ # Determine match type based on keyword presence
320
+ has_emergency = len(contains_emergency_kws) > 0
321
+ has_treatment = len(contains_treatment_kws) > 0
 
 
 
 
 
322
 
323
+ if has_emergency and has_treatment:
324
+ match_type = "both"
325
+ elif has_emergency:
326
+ match_type = "emergency_only"
327
+ elif has_treatment:
328
+ match_type = "treatment_only"
329
+ else:
330
+ match_type = "none"
331
 
332
+ # Update chunk metadata
333
+ chunk.update({
334
+ 'source_type': 'treatment',
335
+ 'contains_emergency_kws': contains_emergency_kws,
336
+ 'contains_treatment_kws': contains_treatment_kws,
337
+ 'match_type': match_type,
338
+ 'emergency_keywords': emergency_keywords, # Store original metadata
339
+ 'treatment_keywords': treatment_keywords,
340
+ 'chunk_id': f"{doc_id}_treatment_chunk_{i}" if doc_id else f"treatment_chunk_{i}"
341
+ })
342
+
343
+ chunks.extend(tr_chunks)
 
 
344
 
345
  return chunks
346
 
 
351
 
352
  all_chunks = []
353
 
354
+ # Add simplified progress bar
355
  for idx, row in tqdm(self.emergency_data.iterrows(),
356
  total=len(self.emergency_data),
357
+ desc="Emergency Processing",
358
+ unit="docs",
359
+ leave=True,
360
+ ncols=80,
361
+ mininterval=1.0):
362
  if pd.notna(row.get('clean_text')) and pd.notna(row.get('matched')):
363
  chunks = self.create_keyword_centered_chunks(
364
  text=row['clean_text'],
 
390
 
391
  all_chunks = []
392
 
393
+ # Add simplified progress bar
394
  for idx, row in tqdm(self.treatment_data.iterrows(),
395
  total=len(self.treatment_data),
396
+ desc="Treatment Processing",
397
+ unit="docs",
398
+ leave=True,
399
+ ncols=80,
400
+ mininterval=1.0):
401
  if (pd.notna(row.get('clean_text')) and
402
  pd.notna(row.get('treatment_matched'))):
403
 
 
501
  logger.info(f"Processing {len(texts)} new {chunk_type} texts in {total_batches} batches...")
502
 
503
  for i in tqdm(range(0, len(texts), batch_size),
504
+ desc=f"Embedding {chunk_type}",
505
  total=total_batches,
506
+ unit="batches",
507
+ leave=True,
508
+ ncols=80,
509
+ mininterval=0.5):
510
  batch_texts = texts[i:i + batch_size]
511
  batch_emb = model.encode(
512
  batch_texts,
src/generation.py ADDED
@@ -0,0 +1,519 @@
1
+ """
2
+ OnCall.ai Medical Advice Generation Module
3
+
4
+ This module handles:
5
+ 1. RAG prompt construction from retrieval results
6
+ 2. Medical advice generation using Med42-70B
7
+ 3. Response formatting and confidence assessment
8
+ 4. Integration with multi-dataset architecture
9
+
10
+ Author: OnCall.ai Team
11
+ Date: 2025-07-31
12
+ """
13
+
14
+ import logging
15
+ from typing import Dict, List, Optional, Any, Union
16
+ from datetime import datetime
17
+ import json
18
+
19
+ # Import existing LLM client
20
+ from llm_clients import llm_Med42_70BClient
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ level=logging.INFO,
25
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
26
+ )
27
+ logger = logging.getLogger(__name__)
28
+
29
+ class MedicalAdviceGenerator:
30
+ """
31
+ Core generation module for medical advice using RAG approach
32
+ """
33
+
34
+ def __init__(self, llm_client: Optional[llm_Med42_70BClient] = None):
35
+ """
36
+ Initialize medical advice generator
37
+
38
+ Args:
39
+ llm_client: Optional Med42-70B client, creates new if None
40
+ """
41
+ self.llm_client = llm_client or llm_Med42_70BClient()
42
+
43
+ # Dataset source priorities for different intentions
44
+ self.dataset_priorities = {
45
+ "treatment": {
46
+ "emergency_subset": 2,
47
+ "treatment_subset": 4,
48
+ "symptom_subset": 0, # Reserved for Dataset B
49
+ "diagnosis_subset": 0 # Reserved for Dataset B
50
+ },
51
+ "diagnosis": {
52
+ "emergency_subset": 4,
53
+ "treatment_subset": 2,
54
+ "symptom_subset": 0, # Reserved for Dataset B
55
+ "diagnosis_subset": 0 # Reserved for Dataset B
56
+ },
57
+ # "STAT": {
58
+ # # NOTE: Use when query contains urgent indicators like "NOW", "STAT", "critical"
59
+ # "emergency_subset": 5,
60
+ # "treatment_subset": 1,
61
+ # "symptom_subset": 0, # Reserved for Dataset B
62
+ # "diagnosis_subset": 0 # Reserved for Dataset B
63
+ # }
64
+ }
65
+
66
+ logger.info("MedicalAdviceGenerator initialized")
67
+
68
+ def generate_medical_advice(self, user_query: str, retrieval_results: Dict[str, Any],
69
+ intention: Optional[str] = None) -> Dict[str, Any]:
70
+ """
71
+ Complete pipeline: construct prompt → generate advice → format response
72
+
73
+ Args:
74
+ user_query: Original user medical query
75
+ retrieval_results: Results from BasicRetrievalSystem.search()
76
+ intention: Optional query intention ('treatment', 'diagnosis', 'STAT'(tentative))
77
+
78
+ Returns:
79
+ Dict containing formatted medical advice and metadata
80
+ """
81
+ try:
82
+ logger.info(f"Generating medical advice for query: '{user_query[:50]}...'")
83
+ start_time = datetime.now()
84
+
85
+ # Step 1: Extract and classify chunks from retrieval results
86
+ classified_chunks = self._classify_retrieval_chunks(retrieval_results)
87
+
88
+ # Step 2: Build RAG prompt based on intention and chunk classification
89
+ rag_prompt = self.generate_prompt(user_query, classified_chunks, intention)
90
+
91
+ # Step 3: Generate medical advice using Med42-70B
92
+ generation_result = self._generate_with_med42(rag_prompt)
93
+
94
+ # Step 4: Format structured response
95
+ formatted_response = self._format_medical_response(
96
+ user_query=user_query,
97
+ generated_advice=generation_result,
98
+ chunks_used=classified_chunks,
99
+ intention=intention,
100
+ processing_time=(datetime.now() - start_time).total_seconds()
101
+ )
102
+
103
+ processing_duration = formatted_response.get('query_metadata', {}).get('processing_time_seconds', 0)
104
+ logger.info(f"Medical advice generated successfully in {processing_duration:.3f}s")
105
+ return formatted_response
106
+
107
+ except Exception as e:
108
+ logger.error(f"Medical advice generation failed: {e}")
109
+ return self._generate_error_response(user_query, str(e))
110
+
111
+ def generate_prompt(self, user_query: str, classified_chunks: Dict[str, List],
112
+ intention: Optional[str] = None) -> str:
113
+ """
114
+ Enhanced prompt generator with flexible dataset integration
115
+
116
+ Args:
117
+ user_query: User's medical query
118
+ classified_chunks: Chunks classified by dataset source
119
+ intention: Query intention if detected
120
+
121
+ Returns:
122
+ Structured RAG prompt for Med42-70B
123
+ """
124
+ logger.info(f"Generating prompt with intention: {intention}")
125
+
126
+ # Extract chunks by dataset source
127
+ emergency_chunks = classified_chunks.get("emergency_subset", [])
128
+ treatment_chunks = classified_chunks.get("treatment_subset", [])
129
+ symptom_chunks = classified_chunks.get("symptom_subset", []) # Dataset B (future)
130
+ diagnosis_chunks = classified_chunks.get("diagnosis_subset", []) # Dataset B (future)
131
+
132
+ # Select chunks based on intention or intelligent defaults
133
+ selected_chunks = self._select_chunks_by_intention(
134
+ intention=intention,
135
+ emergency_chunks=emergency_chunks,
136
+ treatment_chunks=treatment_chunks,
137
+ symptom_chunks=symptom_chunks,
138
+ diagnosis_chunks=diagnosis_chunks
139
+ )
140
+
141
+ # Build context block from selected chunks
142
+ context_block = self._build_context_block(selected_chunks)
143
+
144
+ # Construct medical RAG prompt
145
+ prompt = self._construct_medical_prompt(user_query, context_block, intention)
146
+
147
+ logger.info(f"Generated prompt with {len(selected_chunks)} chunks, {len(context_block)} chars")
148
+ return prompt
149
+
150
+ def _classify_retrieval_chunks(self, retrieval_results: Dict[str, Any]) -> Dict[str, List]:
151
+ """
152
+ Classify retrieval chunks by dataset source
153
+
154
+ Args:
155
+ retrieval_results: Results from BasicRetrievalSystem.search()
156
+
157
+ Returns:
158
+ Dict mapping dataset sources to chunk lists
159
+ """
160
+ classified = {
161
+ "emergency_subset": [],
162
+ "treatment_subset": [],
163
+ "symptom_subset": [], # Reserved for Dataset B
164
+ "diagnosis_subset": [] # Reserved for Dataset B
165
+ }
166
+
167
+ # Process results from current dual-index system
168
+ processed_results = retrieval_results.get('processed_results', [])
169
+
170
+ for chunk in processed_results:
171
+ chunk_type = chunk.get('type', 'unknown')
172
+
173
+ # Map current system types to dataset sources
174
+ if chunk_type == 'emergency':
175
+ classified["emergency_subset"].append(chunk)
176
+ elif chunk_type == 'treatment':
177
+ classified["treatment_subset"].append(chunk)
178
+ else:
179
+ # Unknown type, classify by content analysis or default to STAT (tentative)
180
+ logger.warning(f"Unknown chunk type: {chunk_type}, defaulting to STAT (tentative)")
181
+ classified["emergency_subset"].append(chunk)
182
+
183
+ # TODO: Future integration point for Dataset B
184
+ # When Dataset B team provides symptom/diagnosis data:
185
+ # classified["symptom_subset"] = process_dataset_b_symptoms(retrieval_results)
186
+ # classified["diagnosis_subset"] = process_dataset_b_diagnosis(retrieval_results)
187
+
188
+ logger.info(f"Classified chunks: Emergency={len(classified['emergency_subset'])}, "
189
+ f"Treatment={len(classified['treatment_subset'])}")
190
+
191
+ return classified
192
+
193
+ def _select_chunks_by_intention(self, intention: Optional[str],
194
+ emergency_chunks: List, treatment_chunks: List,
195
+ symptom_chunks: List, diagnosis_chunks: List) -> List:
196
+ """
197
+ Select optimal chunk combination based on query intention
198
+
199
+ Args:
200
+ intention: Detected or specified intention
201
+ *_chunks: Chunks from different dataset sources
202
+
203
+ Returns:
204
+ List of selected chunks for prompt construction
205
+ """
206
+ if intention and intention in self.dataset_priorities:
207
+ # Use predefined priorities for known intentions
208
+ priorities = self.dataset_priorities[intention]
209
+ selected_chunks = []
210
+
211
+ # Add chunks according to priority allocation
212
+ selected_chunks.extend(emergency_chunks[:priorities["emergency_subset"]])
213
+ selected_chunks.extend(treatment_chunks[:priorities["treatment_subset"]])
214
+
215
+ # TODO: Future Dataset B integration
216
+ # selected_chunks.extend(symptom_chunks[:priorities["symptom_subset"]])
217
+ # selected_chunks.extend(diagnosis_chunks[:priorities["diagnosis_subset"]])
218
+
219
+ logger.info(f"Selected chunks by intention '{intention}': {len(selected_chunks)} total")
220
+
221
+ else:
222
+ # No specific intention - let LLM judge from best available chunks
223
+ all_chunks = emergency_chunks + treatment_chunks + symptom_chunks + diagnosis_chunks
224
+
225
+ # Sort by relevance (distance) and take top 6
226
+ all_chunks_sorted = sorted(all_chunks, key=lambda x: x.get("distance", 999))
227
+ selected_chunks = all_chunks_sorted[:6]
228
+
229
+ logger.info(f"Selected chunks by relevance (no intention): {len(selected_chunks)} total")
230
+
231
+ return selected_chunks
232
+
233
+ def _build_context_block(self, selected_chunks: List) -> str:
234
+ """
235
+ Build formatted context block from selected chunks
236
+
237
+ Args:
238
+ selected_chunks: List of selected chunks
239
+
240
+ Returns:
241
+ Formatted context string for prompt
242
+ """
243
+ if not selected_chunks:
244
+ return "No relevant medical guidelines found."
245
+
246
+ context_parts = []
247
+
248
+ for i, chunk in enumerate(selected_chunks, 1):
249
+ chunk_text = chunk.get("text", "").strip()
250
+ chunk_type = chunk.get("type", "unknown")
251
+ distance = chunk.get("distance", 0)
252
+
253
+ # Format each chunk with metadata
254
+ context_part = f"""
255
+ [Guideline {i}] (Source: {chunk_type.title()}, Relevance: {1-distance:.3f})
256
+ {chunk_text}
257
+ """.strip()
258
+
259
+ context_parts.append(context_part)
260
+
261
+ return "\n\n".join(context_parts)
262
+
263
+ def _construct_medical_prompt(self, user_query: str, context_block: str,
264
+ intention: Optional[str]) -> str:
265
+ """
266
+ Construct final medical RAG prompt with appropriate framing
267
+
268
+ Args:
269
+ user_query: Original user query
270
+ context_block: Formatted context from selected chunks
271
+ intention: Query intention if detected
272
+
273
+ Returns:
274
+ Complete RAG prompt for Med42-70B
275
+ """
276
+ # Customize prompt based on intention
277
+ if intention == "treatment":
278
+ focus_guidance = "Focus on providing specific treatment protocols, management steps, and therapeutic interventions."
279
+ elif intention == "diagnosis":
280
+ focus_guidance = "Focus on differential diagnosis, diagnostic criteria, and assessment approaches."
281
+ elif intention == "STAT(tentative)":
282
+ focus_guidance = "Focus on immediate emergency interventions and critical decision-making steps."
283
+ else:
284
+ focus_guidance = "Provide comprehensive medical guidance covering both diagnostic and treatment aspects as appropriate."
285
+
286
+ prompt = f"""You are an experienced attending physician providing guidance to a junior clinician in an emergency setting. A colleague is asking for your expert medical opinion.
287
+
288
+ Clinical Question:
289
+ {user_query}
290
+
291
+ Relevant Medical Guidelines:
292
+ {context_block}
293
+
294
+ Instructions:
295
+ {focus_guidance}
296
+
297
+ Please provide a clear, actionable response that:
298
+ 1. Addresses the specific clinical question asked
299
+ 2. References relevant evidence from the provided guidelines
300
+ 3. Offers practical, step-by-step guidance when appropriate
301
+ 4. Maintains appropriate medical caution and emphasizes the need for clinical judgment
302
+
303
+ Your response should be concise but comprehensive, suitable for immediate clinical application."""
304
+
305
+ return prompt
306
+
307
+ def _generate_with_med42(self, prompt: str) -> Dict[str, Any]:
308
+ """
309
+ Generate medical advice using Med42-70B
310
+
311
+ Args:
312
+ prompt: Complete RAG prompt
313
+
314
+ Returns:
315
+ Generation result with metadata
316
+ """
317
+ try:
318
+ logger.info("Calling Med42-70B for medical advice generation")
319
+
320
+ result = self.llm_client.analyze_medical_query(
321
+ query=prompt,
322
+ max_tokens=500, # Adjust based on needs
323
+ timeout=30.0 # Allow more time for complex medical advice
324
+ )
325
+
326
+ if result.get('error'):
327
+ raise Exception(f"Med42-70B generation error: {result['error']}")
328
+
329
+ return result
330
+
331
+ except Exception as e:
332
+ logger.error(f"Med42-70B generation failed: {e}")
333
+ raise
334
+
335
+ def _format_medical_response(self, user_query: str, generated_advice: Dict[str, Any],
336
+ chunks_used: Dict[str, List], intention: Optional[str],
337
+ processing_time: float) -> Dict[str, Any]:
338
+ """
339
+ Format final medical response with metadata and confidence assessment
340
+
341
+ Args:
342
+ user_query: Original query
343
+ generated_advice: Result from Med42-70B
344
+ chunks_used: Classification of chunks used
345
+ intention: Detected intention
346
+ processing_time: Total processing time
347
+
348
+ Returns:
349
+ Structured medical advice response
350
+ """
351
+ # Extract generated content
352
+ advice_content = generated_advice.get('extracted_condition', '')
353
+ if not advice_content:
354
+ advice_content = generated_advice.get('raw_response', 'Unable to generate medical advice.')
355
+
356
+ # Calculate confidence based on available factors
357
+ confidence_score = self._calculate_confidence_score(generated_advice, chunks_used)
358
+
359
+ # Count chunks used by source
360
+ chunk_counts = {source: len(chunks) for source, chunks in chunks_used.items()}
361
+ total_chunks = sum(chunk_counts.values())
362
+
363
+ formatted_response = {
364
+ "medical_advice": advice_content,
365
+ "confidence_score": confidence_score,
366
+ "query_metadata": {
367
+ "original_query": user_query,
368
+ "detected_intention": intention,
369
+ "processing_time_seconds": processing_time,
370
+ "total_chunks_used": total_chunks,
371
+ "chunks_by_source": chunk_counts
372
+ },
373
+ "generation_metadata": {
374
+ "model_used": "m42-health/Llama3-Med42-70B",
375
+ "generation_time": generated_advice.get('latency', 0),
376
+ "model_confidence": generated_advice.get('confidence', 'unknown'),
377
+ "timestamp": datetime.now().isoformat()
378
+ },
379
+ "sources": {
380
+ "emergency_sources": len(chunks_used.get("emergency_subset", [])),
381
+ "treatment_sources": len(chunks_used.get("treatment_subset", [])),
382
+ "total_sources": total_chunks
383
+ },
384
+ "disclaimer": "This advice is for informational purposes only and should not replace professional medical consultation. Always consult with qualified healthcare providers for medical decisions."
385
+ }
386
+
387
+ return formatted_response
388
+
389
+ def _calculate_confidence_score(self, generated_advice: Dict[str, Any],
390
+ chunks_used: Dict[str, List]) -> float:
391
+ """
392
+ Calculate confidence score based on generation quality and source reliability
393
+
394
+ Args:
395
+ generated_advice: Result from Med42-70B
396
+ chunks_used: Chunks used in generation
397
+
398
+ Returns:
399
+ Confidence score between 0.0 and 1.0
400
+ """
401
+ confidence_factors = []
402
+
403
+ # Factor 1: Model confidence if available
404
+ model_confidence = generated_advice.get('confidence', '0.5')
405
+ try:
406
+ model_conf_value = float(model_confidence)
407
+ confidence_factors.append(model_conf_value)
408
+ except (ValueError, TypeError):
409
+ confidence_factors.append(0.5) # Default neutral confidence
410
+
411
+ # Factor 2: Number of sources used (more sources = higher confidence)
412
+ total_chunks = sum(len(chunks) for chunks in chunks_used.values())
413
+ source_confidence = min(total_chunks / 6.0, 1.0) # Normalize to max 6 chunks
414
+ confidence_factors.append(source_confidence)
415
+
416
+ # Factor 3: Response length (reasonable length indicates comprehensive advice)
417
+ response_length = len(generated_advice.get('raw_response', ''))
418
+ length_confidence = min(response_length / 500.0, 1.0) # Normalize to ~500 chars
419
+ confidence_factors.append(length_confidence)
420
+
421
+ # Factor 4: Processing success (no errors = higher confidence)
422
+ if generated_advice.get('error'):
423
+ confidence_factors.append(0.3) # Lower confidence if errors occurred
424
+ else:
425
+ confidence_factors.append(0.8) # Higher confidence for clean generation
426
+
427
+ # Calculate weighted average
428
+ final_confidence = sum(confidence_factors) / len(confidence_factors)
429
+
430
+ # Ensure confidence is within valid range
431
+ return max(0.1, min(0.95, final_confidence))
432
+
433
+ def _generate_error_response(self, user_query: str, error_message: str) -> Dict[str, Any]:
434
+ """
435
+ Generate error response when generation fails
436
+
437
+ Args:
438
+ user_query: Original query
439
+ error_message: Error details
440
+
441
+ Returns:
442
+ Error response in standard format
443
+ """
444
+ return {
445
+ "medical_advice": "I apologize, but I encountered an error while processing your medical query. Please try rephrasing your question or contact technical support if the issue persists.",
446
+ "confidence_score": 0.0,
447
+ "query_metadata": {
448
+ "original_query": user_query,
449
+ "detected_intention": None,
450
+ "processing_time_seconds": 0.0,
451
+ "total_chunks_used": 0,
452
+ "chunks_by_source": {}
453
+ },
454
+ "generation_metadata": {
455
+ "model_used": "m42-health/Llama3-Med42-70B",
456
+ "error": error_message,
457
+ "timestamp": datetime.now().isoformat()
458
+ },
459
+ "sources": {
460
+ "emergency_sources": 0,
461
+ "treatment_sources": 0,
462
+ "total_sources": 0
463
+ },
464
+ "disclaimer": "This system experienced a technical error. Please consult with qualified healthcare providers for medical decisions."
465
+ }
466
+
467
+ # Example usage and testing
468
+ def main():
469
+ """
470
+ Test the medical advice generation system
471
+ """
472
+ # Initialize generator
473
+ generator = MedicalAdviceGenerator()
474
+
475
+ # Example retrieval results (simulated)
476
+ example_retrieval_results = {
477
+ "processed_results": [
478
+ {
479
+ "type": "emergency",
480
+ "distance": 0.3,
481
+ "text": "Acute myocardial infarction requires immediate assessment including ECG, cardiac enzymes, and chest X-ray. Time-sensitive condition requiring rapid intervention.",
482
+ "matched": "MI|chest pain"
483
+ },
484
+ {
485
+ "type": "treatment",
486
+ "distance": 0.25,
487
+ "text": "Treatment protocol for STEMI includes aspirin 325mg, clopidogrel loading dose, and urgent PCI within 90 minutes when available.",
488
+ "matched_treatment": "aspirin|PCI|thrombolytic"
489
+ }
490
+ ]
491
+ }
492
+
493
+ # Test queries
494
+ test_queries = [
495
+ ("How should I treat a patient with chest pain?", "treatment"),
496
+ ("What are the signs of acute MI?", "diagnosis"),
497
+ # ("Emergency management of cardiac arrest", "STAT(tentative)")
498
+ ]
499
+
500
+ for query, intention in test_queries:
501
+ print(f"\n{'='*60}")
502
+ print(f"Testing: {query}")
503
+ print(f"Intention: {intention}")
504
+
505
+ try:
506
+ result = generator.generate_medical_advice(
507
+ user_query=query,
508
+ retrieval_results=example_retrieval_results,
509
+ intention=intention
510
+ )
511
+
512
+ print(f"✅ Success: {result['confidence_score']:.2f} confidence")
513
+ print(f"Advice: {result['medical_advice'][:200]}...")
514
+
515
+ except Exception as e:
516
+ print(f"❌ Error: {e}")
517
+
518
+ if __name__ == "__main__":
519
+ main()
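
To make the heuristic in `_calculate_confidence_score` concrete, a small worked example with assumed inputs:

```python
# Illustrative numbers only: model confidence 0.8, 4 of 6 chunks used,
# a ~500-character response, and no generation errors
factors = [0.8, min(4 / 6.0, 1.0), min(500 / 500.0, 1.0), 0.8]
score = sum(factors) / len(factors)   # simple average of the four factors
score = max(0.1, min(0.95, score))    # clamped to [0.1, 0.95]
print(round(score, 3))                # 0.817
```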
src/llm_clients.py ADDED
@@ -0,0 +1,308 @@
1
+ """
2
+ OnCall.ai LLM Clients Module
3
+
4
+ Provides specialized LLM clients for medical query processing.
5
+
6
+ Author: OnCall.ai Team
7
+ Date: 2025-07-29
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ from typing import Dict, Optional, Union
13
+ from huggingface_hub import InferenceClient
14
+ from dotenv import load_dotenv
15
+
16
+ # Load environment variables from .env file
17
+ load_dotenv()
18
+
19
+ class llm_Med42_70BClient:
20
+ def __init__(
21
+ self,
22
+ model_name: str = "m42-health/Llama3-Med42-70B",
23
+ timeout: float = 30.0
24
+ ):
25
+ """
26
+ Initialize Medical LLM client for query processing.
27
+
28
+ Args:
29
+ model_name: Hugging Face model name
30
+ timeout: API call timeout duration
31
+
32
+ Warning: This model should not be used for professional medical advice.
33
+ """
34
+ self.logger = logging.getLogger(__name__)
35
+ self.timeout = timeout
36
+
37
+ # Configure logging to show detailed information
38
+ logging.basicConfig(
39
+ level=logging.INFO,
40
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
41
+ )
42
+
43
+ # Get Hugging Face token from environment
44
+ hf_token = os.getenv('HF_TOKEN')
45
+ if not hf_token:
46
+ self.logger.error("HF_TOKEN is missing from environment variables.")
47
+ raise ValueError(
48
+ "HF_TOKEN not found in environment variables. "
49
+ "Please set HF_TOKEN in your .env file or environment. "
50
+ "Ensure the token is not empty and is correctly set."
51
+ )
52
+
53
+ try:
54
+ # Initialize InferenceClient with the new model
55
+ self.client = InferenceClient(
56
+ provider="featherless-ai",
57
+ api_key=hf_token
58
+ )
59
+
60
+ self.logger.info(f"Medical LLM client initialized with model: {model_name}")
61
+ self.logger.warning(
62
+ "Medical LLM Model: Research tool only. "
63
+ "Not for professional medical diagnosis."
64
+ )
65
+ except Exception as e:
66
+ self.logger.error(f"Failed to initialize InferenceClient: {str(e)}")
67
+ self.logger.error(f"Error Type: {type(e).__name__}")
68
+ self.logger.error(f"Detailed Error: {repr(e)}")
69
+ raise ValueError(f"Failed to initialize Medical LLM client: {str(e)}") from e
70
+
71
+ def analyze_medical_query(
72
+ self,
73
+ query: str,
74
+ max_tokens: int = 100,
75
+ timeout: Optional[float] = None
76
+ ) -> Dict[str, Union[str, float]]:
77
+ """
78
+ Analyze medical query and extract condition.
79
+
80
+ Args:
81
+ query: Medical query text
82
+ max_tokens: Maximum tokens to generate
83
+ timeout: Specific API call timeout
84
+
85
+ Returns:
86
+ Extracted medical condition information with latency
87
+ """
88
+ import time
89
+
90
+ # Start timing
91
+ start_time = time.time()
92
+
93
+ try:
94
+ self.logger.info(f"Calling Medical LLM with query: {query}")
95
+
96
+ # Prepare chat completion request with updated system prompt
97
+ response = self.client.chat.completions.create(
98
+ model="m42-health/Llama3-Med42-70B",
99
+ messages=[
100
+ {
101
+ "role": "system",
102
+ "content": """You are a medical assistant trained to extract medical conditions.
103
+
104
+ For medical queries: Extract the most representative medical condition name.
105
+ For non-medical queries: Respond with "NON_MEDICAL_QUERY" and briefly explain why it's not medical.
106
+
107
+ Examples:
108
+ - Medical: "chest pain" → "Acute Coronary Syndrome"
109
+ - Non-medical: "cooking pasta" → "NON_MEDICAL_QUERY. This is about culinary techniques, not medical conditions."
110
+
111
+ DO NOT provide medical advice."""
112
+ },
113
+ {
114
+ "role": "user",
115
+ "content": query
116
+ }
117
+ ],
118
+ max_tokens=max_tokens
119
+ )
120
+
121
+ # Calculate latency
122
+ end_time = time.time()
123
+ latency = end_time - start_time
124
+
125
+ # Extract the response text
126
+ response_text = response.choices[0].message.content or ""
127
+
128
+ # Log raw response and latency
129
+ self.logger.info(f"Raw LLM Response: {response_text}")
130
+ self.logger.info(f"Query Latency: {latency:.4f} seconds")
131
+
132
+ # Extract condition from response
133
+ extracted_condition = self._extract_condition(response_text)
134
+
135
+ # Log the extracted condition
136
+ self.logger.info(f"Extracted Condition: {extracted_condition}")
137
+
138
+ return {
139
+ 'extracted_condition': extracted_condition,
140
+ 'confidence': '0.8',
141
+ 'raw_response': response_text,
142
+ 'latency': latency # Add latency to the return dictionary
143
+ }
144
+
145
+ except Exception as e:
146
+ # Calculate latency even for failed requests
147
+ end_time = time.time()
148
+ latency = end_time - start_time
149
+
150
+ self.logger.error(f"Medical LLM query error: {str(e)}")
151
+ self.logger.error(f"Error Type: {type(e).__name__}")
152
+ self.logger.error(f"Detailed Error: {repr(e)}")
153
+ self.logger.error(f"Query Latency (on error): {latency:.4f} seconds")
154
+
155
+ # Additional context logging
156
+ self.logger.error(f"Query that caused error: {query}")
157
+
158
+ return {
159
+ 'extracted_condition': '',
160
+ 'confidence': '0',
161
+ 'error': str(e),
162
+ 'latency': latency # Include latency even for error cases
163
+ }
164
+
165
+ def _extract_condition(self, response: str) -> str:
166
+ """
167
+ Extract medical condition from model response.
168
+
169
+ Args:
170
+ response: Full model-generated text
171
+
172
+ Returns:
173
+ Extracted medical condition or empty string if non-medical
174
+ """
175
+ # Check if this is a rejection response first
176
+ if self._is_rejection_response(response):
177
+ return ""
178
+
179
+ from medical_conditions import CONDITION_KEYWORD_MAPPING
180
+
181
+ # Search in known medical conditions
182
+ for condition in CONDITION_KEYWORD_MAPPING.keys():
183
+ if condition.lower() in response.lower():
184
+ return condition
185
+
186
+ return response.split('\n')[0].strip() or ""
187
+
188
+ def _is_rejection_response(self, response: str) -> bool:
189
+ """
190
+ Dual-layer detection: prompt compliance + natural language patterns
191
+
192
+ Args:
193
+ response: LLM response text
194
+
195
+ Returns:
196
+ True if response indicates non-medical query rejection
197
+ """
198
+ response_upper = response.upper()
199
+ response_lower = response.lower()
200
+
201
+ # Layer 1: Check for standardized format (if LLM follows prompt)
202
+ if "NON_MEDICAL_QUERY" in response_upper:
203
+ return True
204
+
205
+ # Layer 2: Check natural language rejection patterns (fallback)
206
+ rejection_patterns = [
207
+ "i do not address",
208
+ "do not address",
209
+ "outside my biomedical scope",
210
+ "outside my medical scope",
211
+ "unrelated to medical conditions",
212
+ "not about a medical condition",
213
+ "not a medical condition",
214
+ "this query is outside",
215
+ "culinary practice", # cooking-related
216
+ "technology trends", # programming-related
217
+ "meteorology", # weather-related
218
+ "non-medical context"
219
+ ]
220
+
221
+ return any(pattern in response_lower for pattern in rejection_patterns)
222
+
223
+ def main():
224
+ """
225
+ Test Medical LLM client functionality
226
+ """
227
+ import time
228
+ from datetime import datetime
229
+
230
+ # Record total execution start time
231
+ total_start_time = time.time()
232
+ execution_start_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
233
+
234
+ try:
235
+ print(f"Execution Started at: {execution_start_timestamp}")
236
+
237
+ # Test client initialization
238
+ client = llm_Med42_70BClient()
239
+
240
+ test_queries = [
241
+ "patient experiencing chest pain",
242
+ "sudden weakness on one side",
243
+ "severe headache with neurological symptoms"
244
+ ]
245
+
246
+ # Store individual query results
247
+ query_results = []
248
+
249
+ for query in test_queries:
250
+ print(f"\nTesting query: {query}")
251
+ result = client.analyze_medical_query(query)
252
+
253
+ # Store query result
254
+ query_result = {
255
+ 'query': query,
256
+ 'extracted_condition': result.get('extracted_condition', 'N/A'),
257
+ 'confidence': result.get('confidence', 'N/A'),
258
+ 'latency': result.get('latency', 'N/A')
259
+ }
260
+ query_results.append(query_result)
261
+
262
+ # Print individual query results
263
+ print("Extracted Condition:", query_result['extracted_condition'])
264
+ print("Confidence:", query_result['confidence'])
265
+ print(f"Latency: {query_result['latency']:.4f} seconds")
266
+
267
+ if 'error' in result:
268
+ print("Error:", result['error'])
269
+ print("---")
270
+
271
+ # Calculate total execution time
272
+ total_end_time = time.time()
273
+ total_execution_time = total_end_time - total_start_time
274
+ execution_end_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
275
+
276
+ # Print summary
277
+ print("\n--- Execution Summary ---")
278
+ print(f"Execution Started at: {execution_start_timestamp}")
279
+ print(f"Execution Ended at: {execution_end_timestamp}")
280
+ print(f"Total Execution Time: {total_execution_time:.4f} seconds")
281
+
282
+ # Optional: Return results for potential further processing
283
+ return {
284
+ 'start_time': execution_start_timestamp,
285
+ 'end_time': execution_end_timestamp,
286
+ 'total_execution_time': total_execution_time,
287
+ 'query_results': query_results
288
+ }
289
+
290
+ except Exception as e:
291
+ print(f"Client initialization error: {str(e)}")
292
+ print("Possible issues:")
293
+ print("1. Invalid or missing Hugging Face token")
294
+ print("2. Network connectivity problems")
295
+ print("3. Model access restrictions")
296
+ print("\nPlease check your .env file and Hugging Face token.")
297
+
298
+ # Calculate total execution time even in case of error
299
+ total_end_time = time.time()
300
+ total_execution_time = total_end_time - total_start_time
301
+
302
+ return {
303
+ 'error': str(e),
304
+ 'total_execution_time': total_execution_time
305
+ }
306
+
307
+ if __name__ == "__main__":
308
+ main()
src/medical_conditions.py ADDED
@@ -0,0 +1,99 @@
+ """
+ OnCall.ai Medical Conditions Configuration
+
+ This module provides centralized configuration for:
+ 1. Predefined medical conditions
+ 2. Condition-to-keyword mappings
+ 3. Fallback condition keywords
+
+ Author: OnCall.ai Team
+ Date: 2025-07-29
+ """
+
+ from typing import Dict, Optional
+
+ # Comprehensive Condition-to-Keyword Mapping
+ CONDITION_KEYWORD_MAPPING: Dict[str, Dict[str, str]] = {
+     "acute myocardial infarction": {
+         "emergency": "MI|chest pain|cardiac arrest",
+         "treatment": "aspirin|nitroglycerin|thrombolytic|PCI"
+     },
+     "acute stroke": {
+         "emergency": "stroke|neurological deficit|sudden weakness",
+         "treatment": "tPA|thrombolysis|stroke unit care"
+     },
+     "pulmonary embolism": {
+         "emergency": "chest pain|shortness of breath|sudden dyspnea",
+         "treatment": "anticoagulation|heparin|embolectomy"
+     },
+     # extended from @20250729Test_Retrieval.md
+     "acute_ischemic_stroke": {
+         "emergency": "ischemic stroke|neurological deficit",
+         "treatment": "tPA|stroke unit management"
+     },
+     "hemorrhagic_stroke": {
+         "emergency": "hemorrhagic stroke|intracranial bleeding",
+         "treatment": "blood pressure control|neurosurgery"
+     },
+     "transient_ischemic_attack": {
+         "emergency": "TIA|temporary stroke symptoms",
+         "treatment": "antiplatelet|lifestyle modification"
+     },
+     "acute_coronary_syndrome": {
+         "emergency": "ACS|chest pain|ECG changes",
+         "treatment": "antiplatelet|statins|cardiac monitoring"
+     }
+ }
+
+ # Fallback Condition Keywords
+ FALLBACK_CONDITION_KEYWORDS: Dict[str, str] = {
+     "acute_ischemic_stroke": "acute ischemic stroke treatment",
+     "hemorrhagic_stroke": "hemorrhagic stroke management",
+     "transient_ischemic_attack": "TIA treatment protocol",
+     "acute_coronary_syndrome": "ACS treatment guidelines",
+     "stable_angina": "stable angina management",
+     "non_cardiac_chest_pain": "non-cardiac chest pain evaluation",
+     "witnessed_cardiac_arrest": "witnessed cardiac arrest protocol",
+     "unwitnessed_cardiac_arrest": "unwitnessed cardiac arrest management",
+     "post_resuscitation_care": "post-resuscitation care guidelines"
+ }
+
+ def get_condition_keywords(specific_condition: str) -> Optional[str]:
+     """
+     Retrieve fallback keywords for a specific condition
+
+     Args:
+         specific_condition: Medical condition name
+
+     Returns:
+         Corresponding keywords or the original condition
+     """
+     return FALLBACK_CONDITION_KEYWORDS.get(specific_condition, specific_condition)
+
+ def validate_condition(condition: str) -> bool:
+     """
+     Check if a condition exists in our predefined mapping
+
+     Args:
+         condition: Medical condition to validate
+
+     Returns:
+         Boolean indicating condition validity
+     """
+     return condition.lower() in {k.lower() for k in CONDITION_KEYWORD_MAPPING.keys()}
+
+ def get_condition_details(condition: str) -> Optional[Dict[str, str]]:
+     """
+     Retrieve detailed information for a specific condition
+
+     Args:
+         condition: Medical condition name
+
+     Returns:
+         Dict with emergency and treatment keywords, or None
+     """
+     normalized_condition = condition.lower()
+     for key, value in CONDITION_KEYWORD_MAPPING.items():
+         if key.lower() == normalized_condition:
+             return value
+     return None
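
A quick usage sketch of this configuration module (the import path assumes `src` is importable as a package; the outputs follow from the mapping above):

```python
from src.medical_conditions import (
    get_condition_details,
    get_condition_keywords,
    validate_condition,
)

print(validate_condition("Acute Stroke"))           # True (case-insensitive match)
print(get_condition_details("acute stroke"))        # {'emergency': 'stroke|...', 'treatment': 'tPA|...'}
print(get_condition_keywords("stable_angina"))      # 'stable angina management'
print(get_condition_keywords("unknown condition"))  # falls back to the input string
```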
src/retrieval.py ADDED
@@ -0,0 +1,391 @@
1
+ """
2
+ Basic Retrieval System for OnCall.ai
3
+
4
+ This module implements the core vector retrieval functionality:
5
+ - Basic vector search
6
+ - Source marking
7
+ - Unified output format
8
+ """
9
+
10
+ import numpy as np
11
+ import json
12
+ from pathlib import Path
13
+ from typing import Dict, List, Tuple, Any, Optional
14
+ from sentence_transformers import SentenceTransformer
15
+ from annoy import AnnoyIndex
16
+ import logging
17
+
18
+ # Configure logging
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
22
+ )
23
+ logger = logging.getLogger(__name__)
24
+
25
+ class BasicRetrievalSystem:
26
+ """Basic vector retrieval system for medical documents"""
27
+
28
+ def __init__(self, embedding_dim: int = 768):
29
+ """
30
+ Initialize the retrieval system
31
+
32
+ Args:
33
+ embedding_dim: Dimension of embeddings (default: 768 for PubMedBERT)
34
+ """
35
+ self.embedding_dim = embedding_dim
36
+ self.embedding_model = None
37
+ self.emergency_index = None
38
+ self.treatment_index = None
39
+ self.emergency_chunks = {}
40
+ self.treatment_chunks = {}
41
+
42
+ # Initialize system
43
+ self._initialize_system()
44
+
45
+ def _initialize_system(self) -> None:
46
+ """Initialize embeddings, indices and chunks"""
47
+ try:
48
+ logger.info("Initializing retrieval system...")
49
+
50
+ # Initialize embedding model
51
+ self.embedding_model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
52
+ logger.info("Embedding model loaded successfully")
53
+
54
+ # Initialize Annoy indices
55
+ self.emergency_index = AnnoyIndex(self.embedding_dim, 'angular')
56
+ self.treatment_index = AnnoyIndex(self.embedding_dim, 'angular')
57
+
58
+ # Load data
59
+ current_file = Path(__file__)
60
+ project_root = current_file.parent.parent # from src to root
61
+ base_path = project_root / "models"
62
+ self._load_chunks(base_path)
63
+ self._load_embeddings(base_path)
64
+ self._build_or_load_indices(base_path)
65
+
66
+ logger.info("Retrieval system initialized successfully")
67
+
68
+ except Exception as e:
69
+ logger.error(f"Failed to initialize retrieval system: {e}")
70
+ raise
71
+
72
+ def _load_chunks(self, base_path: Path) -> None:
73
+ """Load chunk data from JSON files"""
74
+ try:
75
+ # Load emergency chunks
76
+ with open(base_path / "embeddings" / "emergency_chunks.json", 'r') as f:
77
+ self.emergency_chunks = json.load(f)
78
+
79
+ # Load treatment chunks
80
+ with open(base_path / "embeddings" / "treatment_chunks.json", 'r') as f:
81
+ self.treatment_chunks = json.load(f)
82
+
83
+ logger.info("Chunks loaded successfully")
84
+
85
+ except FileNotFoundError as e:
86
+ logger.error(f"Chunk file not found: {e}")
87
+ raise
88
+ except json.JSONDecodeError as e:
89
+ logger.error(f"Invalid JSON in chunk file: {e}")
90
+ raise
91
+
92
+ def _load_embeddings(self, base_path: Path) -> None:
93
+ """Load pre-computed embeddings"""
94
+ try:
95
+ # Load emergency embeddings
96
+ self.emergency_embeddings = np.load(
97
+ base_path / "embeddings" / "emergency_embeddings.npy"
98
+ )
99
+
100
+ # Load treatment embeddings
101
+ self.treatment_embeddings = np.load(
102
+ base_path / "embeddings" / "treatment_embeddings.npy"
103
+ )
104
+
105
+ logger.info("Embeddings loaded successfully")
106
+
107
+ except Exception as e:
108
+ logger.error(f"Failed to load embeddings: {e}")
109
+ raise
110
+
111
+ def _build_or_load_indices(self, base_path: Path) -> None:
112
+ """Build or load Annoy indices"""
113
+ indices_path = base_path / "indices" / "annoy"
114
+ emergency_index_path = indices_path / "emergency.ann"
115
+ treatment_index_path = indices_path / "treatment.ann"
116
+
117
+ try:
118
+ # Emergency index
119
+ if emergency_index_path.exists():
120
+ self.emergency_index.load(str(emergency_index_path))
121
+ logger.info("Loaded existing emergency index")
122
+ else:
123
+ self._build_index(
124
+ self.emergency_embeddings,
125
+ self.emergency_index,
126
+ emergency_index_path
127
+ )
128
+ logger.info("Built new emergency index")
129
+
130
+ # Treatment index
131
+ if treatment_index_path.exists():
132
+ self.treatment_index.load(str(treatment_index_path))
133
+ logger.info("Loaded existing treatment index")
134
+ else:
135
+ self._build_index(
136
+ self.treatment_embeddings,
137
+ self.treatment_index,
138
+ treatment_index_path
139
+ )
140
+ logger.info("Built new treatment index")
141
+
142
+ except Exception as e:
143
+ logger.error(f"Failed to build/load indices: {e}")
144
+ raise
145
+
146
+ def _build_index(self, embeddings: np.ndarray, index: AnnoyIndex,
147
+ save_path: Path, n_trees: int = 15) -> None:
148
+ """
149
+ Build and save Annoy index
150
+
151
+ Args:
152
+ embeddings: Embedding vectors
153
+ index: AnnoyIndex instance
154
+ save_path: Path to save the index
155
+ n_trees: Number of trees for Annoy index (default: 15)
156
+ """
157
+ try:
158
+ for i, vec in enumerate(embeddings):
159
+ index.add_item(i, vec)
160
+ index.build(n_trees)
161
+ save_path.parent.mkdir(parents=True, exist_ok=True)
162
+ index.save(str(save_path))
163
+
164
+ except Exception as e:
165
+ logger.error(f"Failed to build index: {e}")
166
+ raise
167
+
168
+ def search(self, query: str, top_k: int = 5) -> Dict[str, Any]:
169
+ """
170
+ Perform vector search on both indices
171
+
172
+ Args:
173
+ query: Search query
174
+ top_k: Number of results to return from each index
175
+
176
+ Returns:
177
+ Dict containing search results and metadata
178
+ """
179
+ try:
180
+ # Get query embedding
181
+ query_embedding = self.embedding_model.encode([query])[0]
182
+
183
+ # Search both indices
184
+ emergency_results = self._search_index(
185
+ query_embedding,
186
+ self.emergency_index,
187
+ self.emergency_chunks,
188
+ "emergency",
189
+ top_k
190
+ )
191
+
192
+ treatment_results = self._search_index(
193
+ query_embedding,
194
+ self.treatment_index,
195
+ self.treatment_chunks,
196
+ "treatment",
197
+ top_k
198
+ )
199
+
200
+ # Log individual index results
201
+ logger.info(f"Search results: Emergency={len(emergency_results)}, Treatment={len(treatment_results)}")
202
+
203
+ results = {
204
+ "query": query,
205
+ "emergency_results": emergency_results,
206
+ "treatment_results": treatment_results,
207
+ "total_results": len(emergency_results) + len(treatment_results)
208
+ }
209
+
210
+ # Post-process results
211
+ processed_results = self.post_process_results(results)
212
+
213
+ return processed_results
214
+
215
+ except Exception as e:
216
+ logger.error(f"Search failed: {e}")
217
+ raise
218
+
219
+ def _search_index(self, query_embedding: np.ndarray, index: AnnoyIndex,
220
+ chunks: Dict, source_type: str, top_k: int) -> List[Dict]:
221
+ """
222
+ Search a single index and format results
223
+
224
+ Args:
225
+ query_embedding: Query vector
226
+ index: AnnoyIndex to search
227
+ chunks: Chunk data
228
+ source_type: Type of source ("emergency" or "treatment")
229
+ top_k: Number of results to return
230
+
231
+ Returns:
232
+ List of formatted results
233
+ """
234
+ # Get nearest neighbors
235
+ indices, distances = index.get_nns_by_vector(
236
+ query_embedding, top_k, include_distances=True
237
+ )
238
+
239
+ # Format results
240
+ results = []
241
+ for idx, distance in zip(indices, distances):
242
+ chunk_data = chunks[idx] # chunks is a list, use integer index directly
243
+ result = {
244
+ "type": source_type, # Using 'type' to match metadata
245
+ "chunk_id": idx,
246
+ "distance": distance,
247
+ "text": chunk_data.get("text", ""),
248
+ "matched": chunk_data.get("matched", ""),
249
+ "matched_treatment": chunk_data.get("matched_treatment", "")
250
+ }
251
+ results.append(result)
252
+
253
+ return results
254
+
255
+ def post_process_results(self, results: Dict[str, Any]) -> Dict[str, Any]:
256
+ """
257
+ Post-process search results
258
+ - Remove duplicates
259
+ - Sort by distance
260
+ - Add metadata enrichment
261
+
262
+ Args:
263
+ results: Raw search results
264
+
265
+ Returns:
266
+ Processed results
267
+ """
268
+ try:
269
+ emergency_results = results["emergency_results"]
270
+ treatment_results = results["treatment_results"]
271
+
272
+ # Combine all results
273
+ all_results = emergency_results + treatment_results
274
+
275
+ # Remove duplicates based on exact text matching
276
+ unique_results = self._remove_duplicates(all_results)
277
+
278
+ # Sort by distance
279
+ sorted_results = sorted(unique_results, key=lambda x: x["distance"])
280
+
281
+ return {
282
+ "query": results["query"],
283
+ "emergency_results": emergency_results, # keep per-source lists so callers can report counts
+ "treatment_results": treatment_results,
+ "processed_results": sorted_results,
284
+ "total_results": len(sorted_results),
285
+ "processing_info": {
286
+ "duplicates_removed": len(all_results) - len(unique_results)
287
+ }
288
+ }
289
+
290
+ except Exception as e:
291
+ logger.error(f"Post-processing failed: {e}")
292
+ raise
293
+
294
+ def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
295
+ """
296
+ Remove duplicate results based on exact text matching
297
+
298
+ Args:
299
+ results: List of search results
300
+
301
+ Returns:
302
+ Deduplicated results with logging statistics
303
+ """
304
+ original_count = len(results)
305
+ seen_texts = set()
306
+ unique_results = []
307
+
308
+ # Sort results by distance (ascending) to keep best matches
309
+ sorted_results = sorted(results, key=lambda x: x["distance"])
310
+
311
+ logger.info(f"Deduplication: Processing {original_count} results using text matching")
312
+
313
+ for result in sorted_results:
314
+ text = result["text"]
315
+ if text not in seen_texts:
316
+ seen_texts.add(text)
317
+ unique_results.append(result)
318
+ else:
319
+ logger.debug(f"Skipping duplicate text: {text[:50]}...")
320
+
321
+ final_count = len(unique_results)
322
+ logger.info(f"Deduplication summary: {original_count} → {final_count} results (removed {original_count - final_count})")
323
+
324
+ return unique_results
325
+
326
+ def search_sliding_window_chunks(self, query: str, top_k: int = 5, window_size: int = 256, overlap: int = 64) -> List[Dict[str, Any]]:
327
+ """
328
+ Perform semantic search using sliding window chunks
329
+
330
+ Args:
331
+ query: Search query
332
+ top_k: Number of top results to return
333
+ window_size: Size of sliding window chunks (currently unused; search runs over the pre-computed chunks)
334
+ overlap: Overlap between sliding windows (currently unused)
335
+
336
+ Returns:
337
+ List of search results with sliding window chunks
338
+ """
339
+ try:
340
+ # Get query embedding
341
+ query_embedding = self.embedding_model.encode([query])[0]
342
+
343
+ # Combine emergency and treatment chunks
344
+ all_chunks = self.emergency_chunks + self.treatment_chunks
345
+ all_embeddings = np.vstack([self.emergency_embeddings, self.treatment_embeddings])
346
+
347
+ # Compute cosine similarities
348
+ similarities = [
349
+ np.dot(query_embedding, chunk_emb) /
350
+ (np.linalg.norm(query_embedding) * np.linalg.norm(chunk_emb))
351
+ for chunk_emb in all_embeddings
352
+ ]
353
+
354
+ # Sort results by similarity
355
+ sorted_indices = np.argsort(similarities)[::-1]
356
+
357
+ # Prepare results
358
+ results = []
359
+ for idx in sorted_indices[:top_k]:
360
+ chunk = all_chunks[idx]
361
+ result = {
362
+ 'text': chunk.get('text', ''),
363
+ 'distance': similarities[idx], # cosine similarity (higher is better), despite the key name
364
+ 'type': 'emergency' if idx < len(self.emergency_chunks) else 'treatment'
365
+ }
366
+ results.append(result)
367
+
368
+ logger.info(f"Sliding window search: Found {len(results)} results")
369
+ return results
370
+
371
+ except Exception as e:
372
+ logger.error(f"Sliding window search failed: {e}")
373
+ return []
374
+
375
+ def search_generic_medical_content(self, query: str, top_k: int = 5) -> List[Dict]:
376
+ """
377
+ Perform generic medical content search
378
+
379
+ Args:
380
+ query: Search query
381
+ top_k: Number of top results to return
382
+
383
+ Returns:
384
+ List of search results
385
+ """
386
+ try:
387
+ # re-use search_sliding_window_chunks method
388
+ return self.search_sliding_window_chunks(query, top_k=top_k)
389
+ except Exception as e:
390
+ logger.error(f"Generic medical content search error: {e}")
391
+ return []
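
Reviewer note: a minimal usage sketch of the retrieval module above, assuming the pre-computed artifacts it loads (models/embeddings/*_chunks.json, *_embeddings.npy, and the Annoy indices under models/indices/annoy/) are available locally. The class name and return keys come from the diff; the query string and path handling are illustrative only.

```python
# Minimal sketch (not part of this PR): exercise BasicRetrievalSystem end to end.
# The Annoy indices are built on first run if the .ann files are missing.
import sys
sys.path.append("src")  # so `retrieval` resolves, mirroring the test script further below

from retrieval import BasicRetrievalSystem

retriever = BasicRetrievalSystem()  # loads PubMedBERT embeddings and both indices

results = retriever.search("acute myocardial infarction management", top_k=5)
print(results["total_results"], "results after deduplication")
for hit in results["processed_results"][:3]:
    # 'distance' here is the Annoy angular distance (lower is better)
    print(f"[{hit['type']}] d={hit['distance']:.3f} :: {hit['text'][:80]}...")
```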
src/user_prompt.py ADDED
@@ -0,0 +1,562 @@
1
+ """
2
+ OnCall.ai User Prompt Processing Module
3
+
4
+ This module handles:
5
+ 1. Condition extraction from user queries
6
+ 2. Keyword mapping
7
+ 3. User confirmation workflow
8
+ 4. Fallback mechanisms
9
+
10
+ Author: OnCall.ai Team
11
+ Date: 2025-07-29
12
+ """
13
+
14
+ import logging
15
+ from typing import Dict, Optional, Any, List
16
+ from sentence_transformers import SentenceTransformer
17
+ import numpy as np # Added missing import for numpy
18
+ import os # Added missing import for os
19
+ import json # Added missing import for json
20
+ import re # Added missing import for re
21
+
22
+ # Import our centralized medical conditions configuration
23
+ from medical_conditions import (
24
+ CONDITION_KEYWORD_MAPPING,
25
+ get_condition_details,
26
+ validate_condition
27
+ )
28
+
29
+ # Configure logging
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
33
+ )
34
+ logger = logging.getLogger(__name__)
35
+
36
+ class UserPromptProcessor:
37
+ def __init__(self, llm_client=None, retrieval_system=None):
38
+ """
39
+ Initialize UserPromptProcessor with optional LLM and retrieval system
40
+
41
+ Args:
42
+ llm_client: Optional Llama3-Med42-70B client for advanced condition extraction
43
+ retrieval_system: Optional retrieval system for semantic search
44
+ """
45
+ self.llm_client = llm_client
46
+ self.retrieval_system = retrieval_system
47
+ self.embedding_model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
48
+
49
+ # Add embeddings directory path
50
+ self.embeddings_dir = os.path.join(os.path.dirname(__file__), '..', 'models', 'embeddings')
51
+
52
+ logger.info("UserPromptProcessor initialized")
53
+
54
+ def extract_condition_keywords(self, user_query: str) -> Dict[str, str]:
55
+ """
56
+ Extract condition keywords with multi-level fallback
57
+
58
+ Args:
59
+ user_query: User's medical query
60
+
61
+ Returns:
62
+ Dict with condition and keywords
63
+ """
64
+
65
+ # Level 1: Predefined Mapping (Fast Path)
66
+ predefined_result = self._predefined_mapping(user_query)
67
+ if predefined_result:
68
+ return predefined_result
69
+
70
+ # Level 2: Llama3-Med42-70B Extraction (if available)
71
+ if self.llm_client:
72
+ llm_result = self._extract_with_llm(user_query)
73
+ if llm_result:
74
+ return llm_result
75
+
76
+ # Level 3: Semantic Search Fallback
77
+ semantic_result = self._semantic_search_fallback(user_query)
78
+ if semantic_result:
79
+ return semantic_result
80
+
81
+ # Level 4: Medical Query Validation
82
+ # Only validate if previous levels failed - speed optimization
83
+ validation_result = self.validate_medical_query(user_query)
84
+ if validation_result: # If validation fails (returns non-None)
85
+ return validation_result
86
+
87
+ # Level 5: Generic Medical Search (after validation passes)
88
+ generic_result = self._generic_medical_search(user_query)
89
+ if generic_result:
90
+ return generic_result
91
+
92
+ # No match found
93
+
94
+ return {
95
+ 'condition': '',
96
+ 'emergency_keywords': '',
97
+ 'treatment_keywords': ''
98
+ }
99
+
100
+ def _predefined_mapping(self, user_query: str) -> Optional[Dict[str, str]]:
101
+ """
102
+ Fast predefined condition mapping
103
+
104
+ Args:
105
+ user_query: User's medical query
106
+
107
+ Returns:
108
+ Mapped condition keywords or None
109
+ """
110
+ query_lower = user_query.lower()
111
+
112
+ for condition, mappings in CONDITION_KEYWORD_MAPPING.items():
113
+ if condition.lower() in query_lower:
114
+ logger.info(f"Matched predefined condition: {condition}")
115
+ return {
116
+ 'condition': condition,
117
+ 'emergency_keywords': mappings['emergency'],
118
+ 'treatment_keywords': mappings['treatment']
119
+ }
120
+
121
+ return None
122
+
123
+ def _extract_with_llm(self, user_query: str) -> Optional[Dict[str, str]]:
124
+ """
125
+ Use Llama3-Med42-70B for advanced condition extraction
126
+
127
+ Args:
128
+ user_query: User's medical query
129
+
130
+ Returns:
131
+ Dict with condition and keywords, or None
132
+ """
133
+ if not self.llm_client:
134
+ return None
135
+
136
+ try:
137
+ llama_response = self.llm_client.analyze_medical_query(
138
+ query=user_query,
139
+ max_tokens=100,
140
+ timeout=2.0
141
+ )
142
+
143
+ extracted_condition = llama_response.get('extracted_condition', '')
144
+
145
+ if extracted_condition and validate_condition(extracted_condition):
146
+ condition_details = get_condition_details(extracted_condition)
147
+ if condition_details:
148
+ return {
149
+ 'condition': extracted_condition,
150
+ 'emergency_keywords': condition_details.get('emergency', ''),
151
+ 'treatment_keywords': condition_details.get('treatment', '')
152
+ }
153
+
154
+ return None
155
+
156
+ except Exception as e:
157
+ logger.error(f"Llama3-Med42-70B condition extraction error: {e}")
158
+ return None
159
+
160
+ def _semantic_search_fallback(self, user_query: str) -> Optional[Dict[str, str]]:
161
+ """
162
+ Perform semantic search for condition extraction using sliding window chunks
163
+
164
+ Args:
165
+ user_query: User's medical query
166
+
167
+ Returns:
168
+ Dict with condition and keywords, or None
169
+ """
170
+ logger.info(f"Starting semantic search fallback for query: '{user_query}'")
171
+
172
+ if not self.retrieval_system:
173
+ logger.warning("No retrieval system available for semantic search")
174
+ return None
175
+
176
+ try:
177
+ # Perform semantic search on sliding window chunks
178
+ semantic_results = self.retrieval_system.search_sliding_window_chunks(user_query)
179
+
180
+ logger.info(f"Semantic search returned {len(semantic_results)} results")
181
+
182
+ if semantic_results:
183
+ # Extract condition from top semantic result
184
+ top_result = semantic_results[0]
185
+ condition = self._infer_condition_from_text(top_result['text'])
186
+
187
+ logger.info(f"Inferred condition: {condition}")
188
+
189
+ if condition and validate_condition(condition):
190
+ condition_details = get_condition_details(condition)
191
+ if condition_details:
192
+ result = {
193
+ 'condition': condition,
194
+ 'emergency_keywords': condition_details.get('emergency', ''),
195
+ 'treatment_keywords': condition_details.get('treatment', ''),
196
+ 'semantic_confidence': top_result.get('distance', 0)
197
+ }
198
+
199
+ logger.info(f"Semantic search successful. Condition: {condition}, "
200
+ f"Confidence: {result['semantic_confidence']}")
201
+ return result
202
+ else:
203
+ logger.warning(f"Condition validation failed for: {condition}")
204
+
205
+ logger.info("No suitable condition found in semantic search")
206
+ return None
207
+
208
+ except Exception as e:
209
+ logger.error(f"Semantic search fallback error: {e}", exc_info=True)
210
+ return None
211
+
212
+ def _generic_medical_search(self, user_query: str) -> Optional[Dict[str, str]]:
213
+ """
214
+ Perform generic medical search as final fallback
215
+
216
+ Args:
217
+ user_query: User's medical query
218
+
219
+ Returns:
220
+ Dict with generic medical keywords
221
+ """
222
+ generic_medical_terms = [
223
+ "medical", "treatment", "management", "protocol",
224
+ "guidelines", "emergency", "acute", "chronic"
225
+ ]
226
+
227
+ generic_query = f"{user_query} medical treatment"
228
+
229
+ try:
230
+ # Perform generic medical search
231
+ generic_results = self.retrieval_system.search_generic_medical_content(generic_query)
232
+
233
+ if generic_results:
234
+ return {
235
+ 'condition': 'generic medical query',
236
+ 'emergency_keywords': 'medical|emergency',
237
+ 'treatment_keywords': 'treatment|management',
238
+ 'generic_confidence': 0.5
239
+ }
240
+
241
+ return None
242
+ except Exception as e:
243
+ logger.error(f"Generic medical search error: {e}")
244
+ return None
245
+
246
+ def _infer_condition_from_text(self, text: str) -> Optional[str]:
247
+ """
248
+ Infer medical condition from text using embedding similarity
249
+
250
+ Args:
251
+ text: Input medical text
252
+
253
+ Returns:
254
+ Inferred condition or None
255
+ """
256
+ # Implement a simple condition inference using embedding similarity
257
+ # This is a placeholder and would need more sophisticated implementation
258
+ conditions = list(CONDITION_KEYWORD_MAPPING.keys())
259
+ text_embedding = self.embedding_model.encode(text)
260
+ condition_embeddings = [self.embedding_model.encode(condition) for condition in conditions]
261
+
262
+ similarities = [
263
+ np.dot(text_embedding, condition_emb) /
264
+ (np.linalg.norm(text_embedding) * np.linalg.norm(condition_emb))
265
+ for condition_emb in condition_embeddings
266
+ ]
267
+
268
+ max_similarity_index = np.argmax(similarities)
269
+ return conditions[max_similarity_index] if similarities[max_similarity_index] > 0.7 else None
270
+
271
+ def validate_keywords(self, keywords: Dict[str, str]) -> bool:
272
+ """
273
+ Validate if extracted keywords exist in our medical indices
274
+
275
+ Args:
276
+ keywords: Dict of emergency and treatment keywords
277
+
278
+ Returns:
279
+ Boolean indicating keyword validity
280
+ """
281
+ emergency_kws = keywords.get('emergency_keywords', '').split('|')
282
+ treatment_kws = keywords.get('treatment_keywords', '').split('|')
283
+
284
+ # Basic validation: check if any keyword is non-empty
285
+ return any(kw.strip() for kw in emergency_kws + treatment_kws)
286
+
287
+ def _check_keyword_in_index(self, keyword: str, index_type: str) -> bool:
288
+ """
289
+ Check if a keyword exists in the specified medical index
290
+
291
+ Args:
292
+ keyword: Keyword to check
293
+ index_type: Type of index ('emergency' or 'treatment')
294
+
295
+ Returns:
296
+ Boolean indicating keyword existence in the index
297
+ """
298
+ # Validate input parameters
299
+ if not keyword or not index_type:
300
+ logger.warning(f"Invalid input: keyword='{keyword}', index_type='{index_type}'")
301
+ return False
302
+
303
+ # Supported index types
304
+ valid_index_types = ['emergency', 'treatment']
305
+ if index_type not in valid_index_types:
306
+ logger.error(f"Unsupported index type: {index_type}")
307
+ return False
308
+
309
+ try:
310
+ # Construct path to chunks file
311
+ chunks_path = os.path.join(self.embeddings_dir, f"{index_type}_chunks.json")
312
+
313
+ # Check file existence
314
+ if not os.path.exists(chunks_path):
315
+ logger.error(f"Index file not found: {chunks_path}")
316
+ return False
317
+
318
+ # Load chunks with error handling
319
+ with open(chunks_path, 'r', encoding='utf-8') as f:
320
+ chunks = json.load(f)
321
+
322
+ # Normalize keyword for flexible matching
323
+ keyword_lower = keyword.lower().strip()
324
+
325
+ # Advanced keyword matching
326
+ for chunk in chunks:
327
+ chunk_text = chunk.get('text', '').lower()
328
+
329
+ # Exact match
330
+ if keyword_lower in chunk_text:
331
+ logger.info(f"Exact match found for '{keyword}' in {index_type} index")
332
+ return True
333
+
334
+ # Partial match with word boundaries
335
+ if re.search(r'\b' + re.escape(keyword_lower) + r'\b', chunk_text):
336
+ logger.info(f"Partial match found for '{keyword}' in {index_type} index")
337
+ return True
338
+
339
+ # No match found
340
+ logger.info(f"No match found for '{keyword}' in {index_type} index")
341
+ return False
342
+
343
+ except json.JSONDecodeError:
344
+ logger.error(f"Invalid JSON in {chunks_path}")
345
+ return False
346
+ except IOError as e:
347
+ logger.error(f"IO error reading {chunks_path}: {e}")
348
+ return False
349
+ except Exception as e:
350
+ logger.error(f"Unexpected error in _check_keyword_in_index: {e}")
351
+ return False
352
+
353
+ def handle_user_confirmation(self, extracted_info: Dict[str, str]) -> Dict[str, Any]:
354
+ """
355
+ Handle user confirmation for extracted condition and keywords
356
+
357
+ Args:
358
+ extracted_info: Dict with condition and keyword information
359
+
360
+ Returns:
361
+ Dict with confirmation status and options
362
+ """
363
+ # If no condition found, request user to rephrase
364
+ if not extracted_info.get('condition'):
365
+ return {
366
+ 'type': 'rephrase_needed',
367
+ 'message': "Could not identify a specific medical condition. Please rephrase your query.",
368
+ 'suggestions': [
369
+ "Try: 'how to treat chest pain'",
370
+ "Try: 'acute stroke management'",
371
+ "Try: 'pulmonary embolism treatment'"
372
+ ]
373
+ }
374
+
375
+ # Prepare confirmation message
376
+ confirmation_message = f"""
377
+ I understand you're asking about: "{extracted_info.get('condition', 'Unknown Condition')}"
378
+
379
+ Extracted Keywords:
380
+ - Emergency: {extracted_info.get('emergency_keywords', 'None')}
381
+ - Treatment: {extracted_info.get('treatment_keywords', 'None')}
382
+
383
+ Please confirm:
384
+ 1) Yes, proceed with search
385
+ 2) No, please rephrase my query
386
+ 3) Modify keywords
387
+ """
388
+
389
+ return {
390
+ 'type': 'confirmation_needed',
391
+ 'message': confirmation_message,
392
+ 'extracted_info': extracted_info
393
+ }
394
+
395
+ def _handle_matching_failure_level1(self, condition: str) -> Optional[Dict[str, Any]]:
396
+ """
397
+ Level 1 Fallback: Loose keyword matching for medical conditions
398
+
399
+ Args:
400
+ condition: The condition to match loosely
401
+
402
+ Returns:
403
+ Dict with matched keywords or None
404
+ """
405
+ # Predefined loose matching keywords for different medical domains
406
+ loose_medical_keywords = {
407
+ 'emergency': [
408
+ 'urgent', 'critical', 'severe', 'acute',
409
+ 'immediate', 'life-threatening', 'emergency'
410
+ ],
411
+ 'treatment': [
412
+ 'manage', 'cure', 'heal', 'recover',
413
+ 'therapy', 'medication', 'intervention'
414
+ ]
415
+ }
416
+
417
+ # Normalize condition
418
+ condition_lower = condition.lower().strip()
419
+
420
+ # Check emergency keywords
421
+ emergency_matches = [
422
+ kw for kw in loose_medical_keywords['emergency']
423
+ if kw in condition_lower
424
+ ]
425
+
426
+ # Check treatment keywords
427
+ treatment_matches = [
428
+ kw for kw in loose_medical_keywords['treatment']
429
+ if kw in condition_lower
430
+ ]
431
+
432
+ # If matches found, return result
433
+ if emergency_matches or treatment_matches:
434
+ logger.info(f"Loose keyword match for condition: {condition}")
435
+ return {
436
+ 'type': 'loose_keyword_match',
437
+ 'condition': condition,
438
+ 'emergency_keywords': '|'.join(emergency_matches),
439
+ 'treatment_keywords': '|'.join(treatment_matches),
440
+ 'confidence': 0.5 # Lower confidence due to loose matching
441
+ }
442
+
443
+ # No loose matches found
444
+ logger.info(f"No loose keyword match for condition: {condition}")
445
+ return None
446
+
447
+ def validate_medical_query(self, user_query: str) -> Optional[Dict[str, Any]]:
448
+ """
449
+ Validate if the query is a medical-related query using Llama3-Med42-70B multi-layer verification
450
+
451
+ Args:
452
+ user_query: User's input query
453
+
454
+ Returns:
455
+ Dict with invalid-query guidance, or None if the query is medical
456
+ """
457
+ # Expanded medical keywords covering comprehensive medical terminology
458
+ predefined_medical_keywords = {
459
+ # Symptoms and signs
460
+ 'pain', 'symptom', 'ache', 'fever', 'inflammation',
461
+ 'bleeding', 'swelling', 'rash', 'bruise', 'wound',
462
+
463
+ # Medical professional terms
464
+ 'disease', 'condition', 'syndrome', 'disorder',
465
+ 'medical', 'health', 'diagnosis', 'treatment',
466
+ 'therapy', 'medication', 'prescription',
467
+
468
+ # Body systems and organs
469
+ 'heart', 'lung', 'brain', 'kidney', 'liver',
470
+ 'blood', 'nerve', 'muscle', 'bone', 'joint',
471
+
472
+ # Medical actions
473
+ 'examine', 'check', 'test', 'scan', 'surgery',
474
+ 'operation', 'emergency', 'urgent', 'critical',
475
+
476
+ # Specific medical fields
477
+ 'cardiology', 'neurology', 'oncology', 'pediatrics',
478
+ 'psychiatry', 'dermatology', 'orthopedics'
479
+ }
480
+
481
+ # Check if query contains predefined medical keywords
482
+ query_lower = user_query.lower()
483
+ if any(kw in query_lower for kw in predefined_medical_keywords):
484
+ return None # Validated by predefined keywords
485
+
486
+ try:
487
+ # Ensure Llama3-Med42-70B client is properly initialized
488
+ if not hasattr(self, 'llm_client') or self.llm_client is None:
489
+ logger.warning("Llama3-Med42-70B client not initialized")  # module-level logger; the class defines no self.logger
490
+ return self._generate_invalid_query_response()
491
+
492
+ # Use Llama3-Med42-70B for final medical query determination
493
+ llama_result = self.llm_client.analyze_medical_query(
494
+ query=user_query,
495
+ max_tokens=100 # Limit tokens for efficiency
496
+ )
497
+
498
+ # If Llama3-Med42-70B successfully extracts a medical condition
499
+ if llama_result.get('extracted_condition'):
500
+ return None # Validated by Llama3-Med42-70B
501
+
502
+ except Exception as e:
503
+ # Log Llama3-Med42-70B analysis failure without blocking the process
504
+ logger.warning(f"Llama3-Med42-70B query validation failed: {e}")
505
+
506
+ # If no medical relevance is found
507
+ return self._generate_invalid_query_response()
508
+
509
+ def _generate_invalid_query_response(self) -> Dict[str, Any]:
510
+ """
511
+ Generate response for non-medical queries
512
+
513
+ Returns:
514
+ Dict with invalid query guidance
515
+ """
516
+ return {
517
+ 'type': 'invalid_query',
518
+ 'message': "This is OnCall.AI, a clinical medical assistance platform. "
519
+ "Please input a medical problem you need help resolving. "
520
+ "\n\nExamples:\n"
521
+ "- 'I'm experiencing chest pain'\n"
522
+ "- 'What are symptoms of stroke?'\n"
523
+ "- 'How to manage acute asthma?'\n"
524
+ "- 'I have a persistent headache'"
525
+ }
526
+
527
+ def main():
528
+ """
529
+ Example usage and testing of UserPromptProcessor with Llama3-Med42-70B
530
+ Demonstrates condition extraction and query validation
531
+ """
532
+ from retrieval import BasicRetrievalSystem
533
+
534
+ # absolute imports (matching the module header) so main() can run as a script; relative imports fail under __main__
535
+ from llm_clients import llm_Med42_70BClient
536
+
537
+ # Initialize LLM client
538
+ llm_client = llm_Med42_70BClient()
539
+ retrieval_system = BasicRetrievalSystem()
540
+
541
+ # Initialize UserPromptProcessor with the LLM client
542
+ processor = UserPromptProcessor(
543
+ llm_client=llm_client, retrieval_system=retrieval_system
544
+ )
545
+
546
+ # Update test cases with more representative medical queries
547
+ test_queries = [
548
+ "patient with severe chest pain and shortness of breath",
549
+ "sudden neurological symptoms suggesting stroke",
550
+ "persistent headache with vision changes"
551
+ ]
552
+
553
+ for query in test_queries:
554
+ print(f"\nQuery: {query}")
555
+ result = processor.extract_condition_keywords(query)
556
+ print("Extracted Keywords:", result)
557
+
558
+ confirmation = processor.handle_user_confirmation(result)
559
+ print("Confirmation:", confirmation['message'])
560
+
561
+ if __name__ == "__main__":
562
+ main()
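
A hedged sketch of the fast path through UserPromptProcessor: with no LLM client and no retrieval system attached, only the Level-1 predefined mapping runs. It assumes src/medical_conditions.py (imported above but not shown in this diff) provides CONDITION_KEYWORD_MAPPING, and that the PubMedBERT embedding model can be downloaded on first use.

```python
# Sketch only: Level-1 predefined mapping plus the confirmation step, no LLM calls.
import sys
sys.path.append("src")

from user_prompt import UserPromptProcessor

# The constructor still loads the PubMedBERT SentenceTransformer (network on first run).
processor = UserPromptProcessor(llm_client=None, retrieval_system=None)

keywords = processor.extract_condition_keywords("how to manage acute stroke?")
# Expected shape; the keyword strings come from medical_conditions.py:
# {'condition': 'acute stroke',
#  'emergency_keywords': 'stroke|...',
#  'treatment_keywords': 'tPA|...'}
print(keywords)

confirmation = processor.handle_user_confirmation(keywords)
print(confirmation["type"])     # 'confirmation_needed' when a condition was found
print(confirmation["message"])
```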
test_retrieval_pipeline.py ADDED
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for OnCall.ai retrieval pipeline
4
+
5
+ This script tests the complete flow:
6
+ user_input → user_prompt.py → retrieval.py
7
+
8
+ Author: OnCall.ai Team
9
+ Date: 2025-07-30
10
+ """
11
+
12
+ import sys
13
+ import os
14
+ from pathlib import Path
15
+ import logging
16
+ import json
17
+ from datetime import datetime
18
+
19
+ # Add src directory to Python path
20
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
21
+
22
+ # Import our modules
23
+ from user_prompt import UserPromptProcessor
24
+ from retrieval import BasicRetrievalSystem
25
+ from llm_clients import llm_Med42_70BClient
26
+
27
+ # Configure logging
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
31
+ handlers=[
32
+ logging.StreamHandler(),
33
+ logging.FileHandler('test_retrieval_pipeline.log')
34
+ ]
35
+ )
36
+ logger = logging.getLogger(__name__)
37
+
38
+ def test_retrieval_pipeline():
39
+ """
40
+ Test the complete retrieval pipeline
41
+ """
42
+ print("="*60)
43
+ print("OnCall.ai Retrieval Pipeline Test")
44
+ print("="*60)
45
+ print(f"Test started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
46
+ print()
47
+
48
+ try:
49
+ # Initialize components
50
+ print("🔧 Initializing components...")
51
+
52
+ # Initialize LLM client
53
+ llm_client = llm_Med42_70BClient()
54
+ print("✅ LLM client initialized")
55
+
56
+ # Initialize retrieval system
57
+ retrieval_system = BasicRetrievalSystem()
58
+ print("✅ Retrieval system initialized")
59
+
60
+ # Initialize user prompt processor
61
+ user_prompt_processor = UserPromptProcessor(
62
+ llm_client=llm_client,
63
+ retrieval_system=retrieval_system
64
+ )
65
+ print("✅ User prompt processor initialized")
66
+ print()
67
+
68
+ # Test queries
69
+ test_queries = [
70
+ "how to treat acute MI?",
71
+ "patient with chest pain and shortness of breath",
72
+ "sudden neurological symptoms suggesting stroke",
73
+ "acute stroke management protocol"
74
+ ]
75
+
76
+ results = []
77
+
78
+ for i, query in enumerate(test_queries, 1):
79
+ print(f"🔍 Test {i}/{len(test_queries)}: Testing query: '{query}'")
80
+ print("-" * 50)
81
+
82
+ try:
83
+ # Step 1: Extract condition keywords
84
+ print("Step 1: Extracting condition keywords...")
85
+ condition_result = user_prompt_processor.extract_condition_keywords(query)
86
+
87
+ print(f" Condition: {condition_result.get('condition', 'None')}")
88
+ print(f" Emergency keywords: {condition_result.get('emergency_keywords', 'None')}")
89
+ print(f" Treatment keywords: {condition_result.get('treatment_keywords', 'None')}")
90
+
91
+ if not condition_result.get('condition'):
92
+ print(" ⚠️ No condition extracted, skipping retrieval")
93
+ continue
94
+
95
+ # Step 2: User confirmation (simulated)
96
+ print("\nStep 2: User confirmation (simulated as 'yes')")
97
+ confirmation = user_prompt_processor.handle_user_confirmation(condition_result)
98
+ print(f" Confirmation type: {confirmation.get('type', 'Unknown')}")
99
+
100
+ # Step 3: Perform retrieval
101
+ print("\nStep 3: Performing retrieval...")
102
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
103
+
104
+ if not search_query:
105
+ search_query = condition_result.get('condition', query)
106
+
107
+ print(f" Search query: '{search_query}'")
108
+
109
+ retrieval_results = retrieval_system.search(search_query, top_k=5)
110
+
111
+ # Display results
112
+ print(f"\n📊 Retrieval Results:")
113
+ print(f" Total results: {retrieval_results.get('total_results', 0)}")
114
+
115
+ emergency_results = retrieval_results.get('emergency_results', [])
116
+ treatment_results = retrieval_results.get('treatment_results', [])
117
+
118
+ print(f" Emergency results: {len(emergency_results)}")
119
+ print(f" Treatment results: {len(treatment_results)}")
120
+
121
+ # Show top results
122
+ if 'processed_results' in retrieval_results:
123
+ processed_results = retrieval_results['processed_results'][:3] # Show top 3
124
+ print(f"\n Top {len(processed_results)} results:")
125
+ for j, result in enumerate(processed_results, 1):
126
+ print(f" {j}. Type: {result.get('type', 'Unknown')}")
127
+ print(f" Distance: {result.get('distance', float('nan')):.4f}")
128
+ print(f" Text preview: {result.get('text', '')[:100]}...")
129
+ print(f" Matched: {result.get('matched', 'None')}")
130
+ print(f" Treatment matched: {result.get('matched_treatment', 'None')}")
131
+ print()
132
+
133
+ # Store results for summary
134
+ test_result = {
135
+ 'query': query,
136
+ 'condition_extracted': condition_result.get('condition', ''),
137
+ 'emergency_keywords': condition_result.get('emergency_keywords', ''),
138
+ 'treatment_keywords': condition_result.get('treatment_keywords', ''),
139
+ 'search_query': search_query,
140
+ 'total_results': retrieval_results.get('total_results', 0),
141
+ 'emergency_count': len(emergency_results),
142
+ 'treatment_count': len(treatment_results),
143
+ 'success': True
144
+ }
145
+ results.append(test_result)
146
+
147
+ print("✅ Test completed successfully")
148
+
149
+ except Exception as e:
150
+ logger.error(f"Error in test {i}: {e}", exc_info=True)
151
+ test_result = {
152
+ 'query': query,
153
+ 'error': str(e),
154
+ 'success': False
155
+ }
156
+ results.append(test_result)
157
+ print(f"❌ Test failed: {e}")
158
+
159
+ print("\n" + "="*60 + "\n")
160
+
161
+ # Print summary
162
+ print_test_summary(results)
163
+
164
+ # Save results to file
165
+ save_test_results(results)
166
+
167
+ return results
168
+
169
+ except Exception as e:
170
+ logger.error(f"Critical error in pipeline test: {e}", exc_info=True)
171
+ print(f"❌ Critical error: {e}")
172
+ return []
173
+
174
+ def print_test_summary(results):
175
+ """Print test summary"""
176
+ print("📋 TEST SUMMARY")
177
+ print("="*60)
178
+
179
+ successful_tests = [r for r in results if r.get('success', False)]
180
+ failed_tests = [r for r in results if not r.get('success', False)]
181
+
182
+ print(f"Total tests: {len(results)}")
183
+ print(f"Successful: {len(successful_tests)}")
184
+ print(f"Failed: {len(failed_tests)}")
185
+ print(f"Success rate: {(len(successful_tests)/len(results)*100 if results else 0):.1f}%")
186
+ print()
187
+
188
+ if successful_tests:
189
+ print("✅ Successful tests:")
190
+ for result in successful_tests:
191
+ print(f" - '{result['query']}'")
192
+ print(f" Condition: {result.get('condition_extracted', 'None')}")
193
+ print(f" Results: {result.get('total_results', 0)} total "
194
+ f"({result.get('emergency_count', 0)} emergency, "
195
+ f"{result.get('treatment_count', 0)} treatment)")
196
+ print()
197
+
198
+ if failed_tests:
199
+ print("❌ Failed tests:")
200
+ for result in failed_tests:
201
+ print(f" - '{result['query']}': {result.get('error', 'Unknown error')}")
202
+ print()
203
+
204
+ def save_test_results(results):
205
+ """Save test results to JSON file"""
206
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
207
+ filename = f"test_results_{timestamp}.json"
208
+
209
+ try:
210
+ with open(filename, 'w', encoding='utf-8') as f:
211
+ json.dump({
212
+ 'timestamp': datetime.now().isoformat(),
213
+ 'test_results': results
214
+ }, f, indent=2, ensure_ascii=False)
215
+
216
+ print(f"📁 Test results saved to: {filename}")
217
+
218
+ except Exception as e:
219
+ logger.error(f"Failed to save test results: {e}")
220
+ print(f"⚠️ Failed to save test results: {e}")
221
+
222
+ if __name__ == "__main__":
223
+ test_retrieval_pipeline()
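
The script above is run directly (`python test_retrieval_pipeline.py`). Since tests/requirements.txt below pins pytest, a thin pytest wrapper is one way to fold it into an automated suite; the sketch below is not part of this PR and assumes the same models/ artifacts and HF_TOKEN environment that the LLM client needs.

```python
# Sketch of a pytest wrapper; place alongside test_retrieval_pipeline.py.
from test_retrieval_pipeline import test_retrieval_pipeline as run_pipeline

def test_pipeline_reports_some_success():
    results = run_pipeline()  # calls the hosted Med42-70B model, so this is an integration test
    assert results, "pipeline produced no results at all"
    assert any(r.get("success") for r in results), "every test query failed"
```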
tests/requirements.txt ADDED
@@ -0,0 +1,95 @@
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.14
4
+ aiosignal==1.4.0
5
+ annotated-types==0.7.0
6
+ annoy==1.17.3
7
+ anyio==4.9.0
8
+ attrs==25.3.0
9
+ Brotli==1.1.0
10
+ certifi==2025.7.14
11
+ charset-normalizer==3.4.2
12
+ click==8.2.1
13
+ contourpy==1.3.2
14
+ cycler==0.12.1
15
+ datasets==4.0.0
16
+ dill==0.3.8
17
+ distro==1.9.0
18
+ fastapi==0.116.1
19
+ ffmpy==0.6.0
20
+ filelock==3.18.0
21
+ fonttools==4.59.0
22
+ frozenlist==1.7.0
23
+ fsspec==2025.3.0
24
+ gradio==5.38.0
25
+ gradio_client==1.11.0
26
+ groovy==0.1.2
27
+ h11==0.16.0
28
+ hf-xet==1.1.5
29
+ httpcore==1.0.9
30
+ httpx==0.28.1
31
+ huggingface-hub==0.33.4
32
+ idna==3.10
33
+ iniconfig==2.1.0
34
+ Jinja2==3.1.6
35
+ jiter==0.10.0
36
+ joblib==1.5.1
37
+ kiwisolver==1.4.8
38
+ markdown-it-py==3.0.0
39
+ MarkupSafe==3.0.2
40
+ matplotlib==3.10.3
41
+ mdurl==0.1.2
42
+ mpmath==1.3.0
43
+ multidict==6.6.3
44
+ multiprocess==0.70.16
45
+ networkx==3.5
46
+ numpy==2.3.1
47
+ openai==1.97.0
48
+ orjson==3.11.0
49
+ packaging==25.0
50
+ pandas==2.3.1
51
+ pillow==11.3.0
52
+ pluggy==1.6.0
53
+ propcache==0.3.2
54
+ pyarrow==20.0.0
55
+ pydantic==2.11.7
56
+ pydantic_core==2.33.2
57
+ pydub==0.25.1
58
+ Pygments==2.19.2
59
+ pyparsing==3.2.3
60
+ pytest==8.4.1
61
+ python-dateutil==2.9.0.post0
62
+ python-multipart==0.0.20
63
+ pytz==2025.2
64
+ PyYAML==6.0.2
65
+ regex==2024.11.6
66
+ requests==2.32.4
67
+ rich==14.0.0
68
+ ruff==0.12.4
69
+ safehttpx==0.1.6
70
+ safetensors==0.5.3
71
+ scikit-learn==1.7.1
72
+ scipy==1.16.1
73
+ seaborn==0.13.2
74
+ semantic-version==2.10.0
75
+ sentence-transformers==3.0.1
76
+ shellingham==1.5.4
77
+ six==1.17.0
78
+ sniffio==1.3.1
79
+ starlette==0.47.2
80
+ sympy==1.14.0
81
+ threadpoolctl==3.6.0
82
+ tokenizers==0.21.2
83
+ tomlkit==0.13.3
84
+ torch==2.7.1
85
+ tqdm==4.67.1
86
+ transformers==4.53.2
87
+ typer==0.16.0
88
+ typing-inspection==0.4.1
89
+ typing_extensions==4.14.1
90
+ tzdata==2025.2
91
+ urllib3==2.5.0
92
+ uvicorn==0.35.0
93
+ websockets==15.0.1
94
+ xxhash==3.5.0
95
+ yarl==1.20.1
tests/result_of_test_end_to_end_pipeline.md ADDED
The diff for this file is too large to render. See raw diff
 
tests/result_of_test_multlevel_fallback_validation.md ADDED
@@ -0,0 +1,570 @@
1
+ 🏥 OnCall.ai Multilevel Fallback Validation Test
2
+ ============================================================
3
+ 🔧 Initializing Components for Multilevel Fallback Test...
4
+ ------------------------------------------------------------
5
+ 1. Initializing Llama3-Med42-70B Client...
6
+ 2025-07-31 07:12:17,625 - llm_clients - INFO - Medical LLM client initialized with model: m42-health/Llama3-Med42-70B
7
+ 2025-07-31 07:12:17,626 - llm_clients - WARNING - Medical LLM Model: Research tool only. Not for professional medical diagnosis.
8
+ ✅ LLM client initialized
9
+ 2. Initializing Retrieval System...
10
+ 2025-07-31 07:12:17,626 - retrieval - INFO - Initializing retrieval system...
11
+ 2025-07-31 07:12:17,637 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
12
+ 2025-07-31 07:12:17,637 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
13
+ 2025-07-31 07:12:20,936 - retrieval - INFO - Embedding model loaded successfully
14
+ 2025-07-31 07:12:22,314 - retrieval - INFO - Chunks loaded successfully
15
+ 2025-07-31 07:12:22,418 - retrieval - INFO - Embeddings loaded successfully
16
+ 2025-07-31 07:12:22,419 - retrieval - INFO - Loaded existing emergency index
17
+ 2025-07-31 07:12:22,420 - retrieval - INFO - Loaded existing treatment index
18
+ 2025-07-31 07:12:22,420 - retrieval - INFO - Retrieval system initialized successfully
19
+ ✅ Retrieval system initialized
20
+ 3. Initializing User Prompt Processor...
21
+ 2025-07-31 07:12:22,420 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
22
+ 2025-07-31 07:12:22,420 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
23
+ 2025-07-31 07:12:24,622 - user_prompt - INFO - UserPromptProcessor initialized
24
+ ✅ User prompt processor initialized
25
+
26
+ 🎉 All components initialized successfully!
27
+
28
+ 🚀 Starting Multilevel Fallback Test Suite
29
+ Total test cases: 13
30
+ Test started at: 2025-07-31 07:12:17
31
+ ================================================================================
32
+
33
+ 🔍 level1_001: Level 1: Direct predefined condition match
34
+ Query: 'acute myocardial infarction treatment'
35
+ Expected Level: 1
36
+ ----------------------------------------------------------------------
37
+ 🎯 Executing multilevel fallback...
38
+ 2025-07-31 07:12:24,623 - user_prompt - INFO - Matched predefined condition: acute myocardial infarction
39
+ ✅ Detected Level: 1
40
+ Condition: acute myocardial infarction
41
+ Emergency Keywords: MI|chest pain|cardiac arrest
42
+ Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
43
+ Execution Time: 0.000s
44
+ 🎉 Test PASSED - Expected behavior achieved
45
+
46
+ 🔍 level1_002: Level 1: Predefined stroke condition
47
+ Query: 'how to manage acute stroke?'
48
+ Expected Level: 1
49
+ ----------------------------------------------------------------------
50
+ 🎯 Executing multilevel fallback...
51
+ 2025-07-31 07:12:24,623 - user_prompt - INFO - Matched predefined condition: acute stroke
52
+ ✅ Detected Level: 1
53
+ Condition: acute stroke
54
+ Emergency Keywords: stroke|neurological deficit|sudden weakness
55
+ Treatment Keywords: tPA|thrombolysis|stroke unit care
56
+ Execution Time: 0.000s
57
+ 🎉 Test PASSED - Expected behavior achieved
58
+
59
+ 🔍 level1_003: Level 1: Predefined PE condition
60
+ Query: 'pulmonary embolism emergency protocol'
61
+ Expected Level: 1
62
+ ----------------------------------------------------------------------
63
+ 🎯 Executing multilevel fallback...
64
+ 2025-07-31 07:12:24,623 - user_prompt - INFO - Matched predefined condition: pulmonary embolism
65
+ ✅ Detected Level: 1
66
+ Condition: pulmonary embolism
67
+ Emergency Keywords: chest pain|shortness of breath|sudden dyspnea
68
+ Treatment Keywords: anticoagulation|heparin|embolectomy
69
+ Execution Time: 0.000s
70
+ 🎉 Test PASSED - Expected behavior achieved
71
+
72
+ 🔍 level2_001: Level 2: Symptom-based query requiring LLM analysis
73
+ Query: 'patient with severe crushing chest pain radiating to left arm'
74
+ Expected Level: 2
75
+ ----------------------------------------------------------------------
76
+ 🎯 Executing multilevel fallback...
77
+ 2025-07-31 07:12:24,623 - llm_clients - INFO - Calling Medical LLM with query: patient with severe crushing chest pain radiating to left arm
78
+ 2025-07-31 07:12:47,629 - llm_clients - INFO - Raw LLM Response: Acute Myocardial Infarction (STEMI) - considering "severe crushing chest pain" and radiation to the left arm, which are classic symptoms of a heart attack specifically involving ST-elevation (STEMI type), indicating complete blockage of a coronary artery. However, please note that as an AI assistant, I don't diagnose; this interpretation is based on common clinical presentation. A healthcare provider should perform an ECG and other tests for confirmation.
79
+ 2025-07-31 07:12:47,630 - llm_clients - INFO - Query Latency: 23.0064 seconds
80
+ 2025-07-31 07:12:47,630 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
81
+ ✅ Detected Level: 1
82
+ Condition: acute myocardial infarction
83
+ Emergency Keywords: MI|chest pain|cardiac arrest
84
+ Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
85
+ Execution Time: 23.008s
86
+ 🎉 Test PASSED - Expected behavior achieved
87
+
88
+ 🔍 level2_002: Level 2: Neurological symptoms requiring LLM
89
+ Query: 'sudden onset weakness on right side with speech difficulty'
90
+ Expected Level: 2
91
+ ----------------------------------------------------------------------
92
+ 🎯 Executing multilevel fallback...
93
+ 2025-07-31 07:12:47,631 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
94
+ 2025-07-31 07:12:56,760 - llm_clients - INFO - Raw LLM Response: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
95
+
96
+ - This diagnosis represents the most likely condition given the sudden onset of right-sided weakness (hemiparesis) and speech difficulty (aphasia). An ischemic stroke occurs when blood flow to a part of the brain is blocked, typically by a thrombus or embolus, causing damage to brain tissue and resulting in neurological deficits. Immediate medical
97
+ 2025-07-31 07:12:56,760 - llm_clients - INFO - Query Latency: 9.1288 seconds
98
+ 2025-07-31 07:12:56,760 - llm_clients - INFO - Extracted Condition: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
99
+ 2025-07-31 07:12:56,760 - user_prompt - INFO - Starting semantic search fallback for query: 'sudden onset weakness on right side with speech difficulty'
100
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.66it/s]
101
+ 2025-07-31 07:12:58,013 - retrieval - INFO - Sliding window search: Found 5 results
102
+ 2025-07-31 07:12:58,023 - user_prompt - INFO - Semantic search returned 5 results
103
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.88it/s]
104
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.77it/s]
105
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.88it/s]
106
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.68it/s]
107
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.51it/s]
108
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.08it/s]
109
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.75it/s]
110
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.98it/s]
111
+ 2025-07-31 07:12:58,342 - user_prompt - INFO - Inferred condition: None
112
+ 2025-07-31 07:12:58,342 - user_prompt - WARNING - Condition validation failed for: None
113
+ 2025-07-31 07:12:58,342 - user_prompt - INFO - No suitable condition found in semantic search
114
+ 2025-07-31 07:12:58,342 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
115
+ 2025-07-31 07:13:09,255 - llm_clients - INFO - Raw LLM Response: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
116
+
117
+ - This diagnosis represents the most likely condition given the sudden onset of right-sided weakness (hemiparesis) and speech difficulty (aphasia), which are classic symptoms of an ischemic stroke affecting the dominant hemisphere (assuming the patient is right-handed).
118
+
119
+ Please note that only a qualified physician can confirm a diagnosis after a thorough evaluation, including imaging studies
120
+ 2025-07-31 07:13:09,255 - llm_clients - INFO - Query Latency: 10.9129 seconds
121
+ 2025-07-31 07:13:09,255 - llm_clients - INFO - Extracted Condition: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke (specifically, with right hemiparesis and aphasia)
122
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 8.55it/s]
123
+ 2025-07-31 07:13:09,844 - retrieval - INFO - Sliding window search: Found 5 results
124
+ ✅ Detected Level: 5
125
+ Condition: generic medical query
126
+ Emergency Keywords: medical|emergency
127
+ Treatment Keywords: treatment|management
128
+ Execution Time: 22.223s
129
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
130
+
131
+ 🔍 level3_001: Level 3: Generic medical terms requiring semantic search
132
+ Query: 'emergency management of cardiovascular crisis'
133
+ Expected Level: 3
134
+ ----------------------------------------------------------------------
135
+ 🎯 Executing multilevel fallback...
136
+ 2025-07-31 07:13:09,854 - llm_clients - INFO - Calling Medical LLM with query: emergency management of cardiovascular crisis
137
+ 2025-07-31 07:13:20,094 - llm_clients - INFO - Raw LLM Response: Cardiac Arrest (or, in context of crisis not yet arrest: Acute Cardiogenic Emergency, e.g., STEMI)
138
+
139
+ - Note: As a text-based AI assistant, not a clinician, I don't provide medical advice. The term given here represents the most critical cardiovascular crisis requiring immediate emergency intervention. Cardiac arrest implies the heart has stopped pumping, while acute cardiogenic emergency (e.g., ST-elevation myocardial infarction, or STEMI) signifies severe heart
140
+ 2025-07-31 07:13:20,095 - llm_clients - INFO - Query Latency: 10.2402 seconds
141
+ 2025-07-31 07:13:20,095 - llm_clients - INFO - Extracted Condition: Cardiac Arrest (or, in context of crisis not yet arrest: Acute Cardiogenic Emergency, e.g., STEMI)
142
+ 2025-07-31 07:13:20,095 - user_prompt - INFO - Starting semantic search fallback for query: 'emergency management of cardiovascular crisis'
143
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.11it/s]
144
+ 2025-07-31 07:13:20,681 - retrieval - INFO - Sliding window search: Found 5 results
145
+ 2025-07-31 07:13:20,713 - user_prompt - INFO - Semantic search returned 5 results
146
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.75it/s]
147
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.28it/s]
148
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.29it/s]
149
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.79it/s]
150
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.12it/s]
151
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.44it/s]
152
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.88it/s]
153
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.20it/s]
154
+ 2025-07-31 07:13:20,905 - user_prompt - INFO - Inferred condition: None
155
+ 2025-07-31 07:13:20,905 - user_prompt - WARNING - Condition validation failed for: None
156
+ 2025-07-31 07:13:20,905 - user_prompt - INFO - No suitable condition found in semantic search
157
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.96it/s]
158
+ 2025-07-31 07:13:21,492 - retrieval - INFO - Sliding window search: Found 5 results
159
+ ✅ Detected Level: 5
160
+ Condition: generic medical query
161
+ Emergency Keywords: medical|emergency
162
+ Treatment Keywords: treatment|management
163
+ Execution Time: 11.647s
164
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
165
+
166
+ 🔍 level3_002: Level 3: Medical terminology requiring semantic fallback
167
+ Query: 'urgent neurological intervention protocols'
168
+ Expected Level: 3
169
+ ----------------------------------------------------------------------
170
+ 🎯 Executing multilevel fallback...
171
+ 2025-07-31 07:13:21,501 - llm_clients - INFO - Calling Medical LLM with query: urgent neurological intervention protocols
172
+ 2025-07-31 07:13:30,536 - llm_clients - INFO - Raw LLM Response: The most representative condition: Acute Ischemic Stroke (requiring urgent neurointervention, such as thrombectomy)
173
+
174
+ Explanation: The phrase "urgent neurological intervention protocols" typically refers to time-critical situations in neurology, and among these, acute ischemic stroke is a prime example. Acute ischemic stroke necessitates rapid evaluation and intervention, including thrombectomy, to restore blood flow and minimize brain damage. This condition demands urgent action due to its narrow therapeutic window, typically within
175
+ 2025-07-31 07:13:30,537 - llm_clients - INFO - Query Latency: 9.0352 seconds
176
+ 2025-07-31 07:13:30,537 - llm_clients - INFO - Extracted Condition: The most representative condition: Acute Ischemic Stroke (requiring urgent neurointervention, such as thrombectomy)
177
+ 2025-07-31 07:13:30,537 - user_prompt - INFO - Starting semantic search fallback for query: 'urgent neurological intervention protocols'
178
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 7.94it/s]
179
+ 2025-07-31 07:13:31,115 - retrieval - INFO - Sliding window search: Found 5 results
180
+ 2025-07-31 07:13:31,123 - user_prompt - INFO - Semantic search returned 5 results
181
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.96it/s]
182
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.55it/s]
183
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.09it/s]
184
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.23it/s]
185
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.16it/s]
186
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.05it/s]
187
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.42it/s]
188
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.08it/s]
189
+ 2025-07-31 07:13:31,334 - user_prompt - INFO - Inferred condition: None
190
+ 2025-07-31 07:13:31,334 - user_prompt - WARNING - Condition validation failed for: None
191
+ 2025-07-31 07:13:31,334 - user_prompt - INFO - No suitable condition found in semantic search
192
+ Batches: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.31it/s]
193
+ 2025-07-31 07:13:31,889 - retrieval - INFO - Sliding window search: Found 5 results
194
+ ✅ Detected Level: 5
195
+ Condition: generic medical query
196
+ Emergency Keywords: medical|emergency
197
+ Treatment Keywords: treatment|management
198
+ Execution Time: 10.398s
199
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
200
+
201
+ 🔍 level4a_001: Level 4a: Non-medical query should be rejected
202
+ Query: 'how to cook pasta properly?'
203
+ Expected Level: 4
204
+ ----------------------------------------------------------------------
205
+ 🎯 Executing multilevel fallback...
206
+ 2025-07-31 07:13:31,899 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
207
+ 2025-07-31 07:13:41,038 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address cooking techniques, only medical conditions. However, for context (not advice): This query doesn't represent a medical condition; it's about culinary practice. In this case, "properly" cooking pasta typically means achieving al dente texture (not overly soft) by boiling in adequately salted water for the recommended time on the package, then draining well. This is unrelated to any health condition unless discussing, hypothetically, gastrointestinal tolerance in specific patients (e
208
+ 2025-07-31 07:13:41,038 - llm_clients - INFO - Query Latency: 9.1386 seconds
209
+ 2025-07-31 07:13:41,038 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address cooking techniques, only medical conditions. However, for context (not advice): This query doesn't represent a medical condition; it's about culinary practice. In this case, "properly" cooking pasta typically means achieving al dente texture (not overly soft) by boiling in adequately salted water for the recommended time on the package, then draining well. This is unrelated to any health condition unless discussing, hypothetically, gastrointestinal tolerance in specific patients (e
210
+ 2025-07-31 07:13:41,038 - user_prompt - INFO - Starting semantic search fallback for query: 'how to cook pasta properly?'
211
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.02it/s]
212
+ 2025-07-31 07:13:42,156 - retrieval - INFO - Sliding window search: Found 5 results
213
+ 2025-07-31 07:13:42,165 - user_prompt - INFO - Semantic search returned 5 results
214
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 9.34it/s]
215
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 52.88it/s]
216
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.97it/s]
217
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.95it/s]
218
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.63it/s]
219
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.07it/s]
220
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.84it/s]
221
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.43it/s]
222
+ 2025-07-31 07:13:42,407 - user_prompt - INFO - Inferred condition: None
223
+ 2025-07-31 07:13:42,407 - user_prompt - WARNING - Condition validation failed for: None
224
+ 2025-07-31 07:13:42,407 - user_prompt - INFO - No suitable condition found in semantic search
225
+ 2025-07-31 07:13:42,407 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
226
+ 2025-07-31 07:13:51,634 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I don't address cooking techniques, but for context (not medical advice): In terms of relevance to health, the key aspect here isn't "proper" cooking per se, but rather avoiding overcooking that can reduce nutrient content. For whole-grain pasta, aim for al dente texture (firm, not mushy) to preserve fiber and other nutrients. However, this query is not about a medical condition but a culinary practice.
227
+ Representative (non-medical) term
228
+ 2025-07-31 07:13:51,634 - llm_clients - INFO - Query Latency: 9.2269 seconds
229
+ 2025-07-31 07:13:51,634 - llm_clients - INFO - Extracted Condition: As a medical assistant, I don't address cooking techniques, but for context (not medical advice): In terms of relevance to health, the key aspect here isn't "proper" cooking per se, but rather avoiding overcooking that can reduce nutrient content. For whole-grain pasta, aim for al dente texture (firm, not mushy) to preserve fiber and other nutrients. However, this query is not about a medical condition but a culinary practice.
230
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.02it/s]
231
+ 2025-07-31 07:13:52,790 - retrieval - INFO - Sliding window search: Found 5 results
232
+ ✅ Detected Level: 5
233
+ Condition: generic medical query
234
+ Emergency Keywords: medical|emergency
235
+ Treatment Keywords: treatment|management
236
+ Execution Time: 20.900s
237
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
238
+
239
+ 🔍 level4a_002: Level 4a: Technology query should be rejected
240
+ Query: 'best programming language to learn in 2025'
241
+ Expected Level: 4
242
+ ----------------------------------------------------------------------
243
+ 🎯 Executing multilevel fallback...
244
+ 2025-07-31 07:13:52,799 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
245
+ 2025-07-31 07:14:02,339 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address technology or education preferences like "best programming language" (which is non-medical context); however, for clarity, this query is outside my biomedical scope. In 2025 or any current year, the choice of "best" programming language is subjective and depends on industry trends, personal goals, and specific use cases (e.g., web development, mobile apps, or data science). Popular choices include Python, JavaScript, and Java, but it's crucial
246
+ 2025-07-31 07:14:02,339 - llm_clients - INFO - Query Latency: 9.5400 seconds
247
+ 2025-07-31 07:14:02,339 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address technology or education preferences like "best programming language" (which is non-medical context); however, for clarity, this query is outside my biomedical scope. In 2025 or any current year, the choice of "best" programming language is subjective and depends on industry trends, personal goals, and specific use cases (e.g., web development, mobile apps, or data science). Popular choices include Python, JavaScript, and Java, but it's crucial
248
+ 2025-07-31 07:14:02,339 - user_prompt - INFO - Starting semantic search fallback for query: 'best programming language to learn in 2025'
249
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6.45it/s]
250
+ 2025-07-31 07:14:02,974 - retrieval - INFO - Sliding window search: Found 5 results
251
+ 2025-07-31 07:14:02,986 - user_prompt - INFO - Semantic search returned 5 results
252
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.16it/s]
253
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.42it/s]
254
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.06it/s]
255
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.97it/s]
256
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 40.41it/s]
257
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.85it/s]
258
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.99it/s]
259
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.63it/s]
260
+ 2025-07-31 07:14:03,457 - user_prompt - INFO - Inferred condition: None
261
+ 2025-07-31 07:14:03,457 - user_prompt - WARNING - Condition validation failed for: None
262
+ 2025-07-31 07:14:03,457 - user_prompt - INFO - No suitable condition found in semantic search
263
+ 2025-07-31 07:14:03,457 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
264
+ 2025-07-31 07:14:13,766 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I don't analyze technology trends or recommend programming languages; however, for clarity's sake (though out of my medical scope), in 2021 (not 2025's future prediction, as I'm bound by current data), popular choices for learning include Python, JavaScript, and Java due to their versatility, wide adoption, and job market demand. Keep in mind this information is not medical advice but rather a layman's interpretation of tech trends.
265
+
266
+ Representative Condition (not
267
+ 2025-07-31 07:14:13,766 - llm_clients - INFO - Query Latency: 10.3088 seconds
268
+ 2025-07-31 07:14:13,767 - llm_clients - INFO - Extracted Condition: As a medical assistant, I don't analyze technology trends or recommend programming languages; however, for clarity's sake (though out of my medical scope), in 2021 (not 2025's future prediction, as I'm bound by current data), popular choices for learning include Python, JavaScript, and Java due to their versatility, wide adoption, and job market demand. Keep in mind this information is not medical advice but rather a layman's interpretation of tech trends.
269
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.00it/s]
270
+ 2025-07-31 07:14:14,884 - retrieval - INFO - Sliding window search: Found 5 results
271
+ ✅ Detected Level: 5
272
+ Condition: generic medical query
273
+ Emergency Keywords: medical|emergency
274
+ Treatment Keywords: treatment|management
275
+ Execution Time: 22.107s
276
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
277
+
278
+ 🔍 level4a_003: Level 4a: Weather query should be rejected
279
+ Query: 'weather forecast for tomorrow'
280
+ Expected Level: 4
281
+ ----------------------------------------------------------------------
282
+ 🎯 Executing multilevel fallback...
283
+ 2025-07-31 07:14:14,905 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
284
+ 2025-07-31 07:14:24,069 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address weather forecasts; however, for context clarification, this query is unrelated to medical conditions. The requested information here is about meteorology (weather prediction) rather than health or disease. There's no representative medical condition to provide in this case.
285
+
286
+ If, however, you were referring indirectly to weather-sensitive health conditions (e.g., heat exhaustion, cold-induced asthma exacerbation), the specific condition would depend on the actual weather forecast details (temperature, humidity, etc.)
287
+ 2025-07-31 07:14:24,069 - llm_clients - INFO - Query Latency: 9.1634 seconds
288
+ 2025-07-31 07:14:24,069 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address weather forecasts; however, for context clarification, this query is unrelated to medical conditions. The requested information here is about meteorology (weather prediction) rather than health or disease. There's no representative medical condition to provide in this case.
289
+ 2025-07-31 07:14:24,070 - user_prompt - INFO - Starting semantic search fallback for query: 'weather forecast for tomorrow'
290
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.17it/s]
291
+ 2025-07-31 07:14:25,222 - retrieval - INFO - Sliding window search: Found 5 results
292
+ 2025-07-31 07:14:25,234 - user_prompt - INFO - Semantic search returned 5 results
293
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.71it/s]
294
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.65it/s]
295
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.87it/s]
296
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.21it/s]
297
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.32it/s]
298
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.77it/s]
299
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 53.42it/s]
300
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.34it/s]
301
+ 2025-07-31 07:14:25,491 - user_prompt - INFO - Inferred condition: None
302
+ 2025-07-31 07:14:25,491 - user_prompt - WARNING - Condition validation failed for: None
303
+ 2025-07-31 07:14:25,491 - user_prompt - INFO - No suitable condition found in semantic search
304
+ 2025-07-31 07:14:25,491 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
305
+ 2025-07-31 07:14:35,356 - llm_clients - INFO - Raw LLM Response: As a medical assistant, I do not address weather forecasts; however, for this context (to maintain representativeness in terms unrelated to diagnosis), the phrase here isn't indicative of a medical condition. Instead, it's about environmental information—specifically, a request for meteorological data (tomorrow's weather). In medical terminology, we wouldn't classify this as a condition, but for representation's sake in a non-medical context, it can be labeled as "meteorological inquiry" or simply
306
+ 2025-07-31 07:14:35,356 - llm_clients - INFO - Query Latency: 9.8645 seconds
307
+ 2025-07-31 07:14:35,356 - llm_clients - INFO - Extracted Condition: As a medical assistant, I do not address weather forecasts; however, for this context (to maintain representativeness in terms unrelated to diagnosis), the phrase here isn't indicative of a medical condition. Instead, it's about environmental information—specifically, a request for meteorological data (tomorrow's weather). In medical terminology, we wouldn't classify this as a condition, but for representation's sake in a non-medical context, it can be labeled as "meteorological inquiry" or simply
308
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.19it/s]
309
+ 2025-07-31 07:14:36,024 - retrieval - INFO - Sliding window search: Found 5 results
310
+ ✅ Detected Level: 5
311
+ Condition: generic medical query
312
+ Emergency Keywords: medical|emergency
313
+ Treatment Keywords: treatment|management
314
+ Execution Time: 21.128s
315
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
316
+
317
+ 🔍 level4b_001: Level 4b→5: Obscure medical query passing validation to generic search
318
+ Query: 'rare hematologic malignancy treatment approaches'
319
+ Expected Level: 5
320
+ ----------------------------------------------------------------------
321
+ 🎯 Executing multilevel fallback...
322
+ 2025-07-31 07:14:36,033 - llm_clients - INFO - Calling Medical LLM with query: rare hematologic malignancy treatment approaches
323
+ 2025-07-31 07:14:45,301 - llm_clients - INFO - Raw LLM Response: The most representative condition: Myelofibrosis (or, in context of "rare" reference, could be an even less common variant like BCR-ABL1-negative atypical CML or unclassifiable myeloproliferative neoplasm)
324
+
325
+ - For myelofibrosis, primary treatment approaches include JAK2 inhibitors (e.g., ruxolitinib), supportive care (transfusions, erythropoiesis-stimulating agents), and allog
326
+ 2025-07-31 07:14:45,302 - llm_clients - INFO - Query Latency: 9.2678 seconds
327
+ 2025-07-31 07:14:45,302 - llm_clients - INFO - Extracted Condition: The most representative condition: Myelofibrosis (or, in context of "rare" reference, could be an even less common variant like BCR-ABL1-negative atypical CML or unclassifiable myeloproliferative neoplasm)
328
+ 2025-07-31 07:14:45,302 - user_prompt - INFO - Starting semantic search fallback for query: 'rare hematologic malignancy treatment approaches'
329
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.00it/s]
330
+ 2025-07-31 07:14:46,428 - retrieval - INFO - Sliding window search: Found 5 results
331
+ 2025-07-31 07:14:46,436 - user_prompt - INFO - Semantic search returned 5 results
332
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.59it/s]
333
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 38.61it/s]
334
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.66it/s]
335
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.40it/s]
336
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.09it/s]
337
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.42it/s]
338
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.98it/s]
339
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 66.70it/s]
340
+ 2025-07-31 07:14:46,672 - user_prompt - INFO - Inferred condition: None
341
+ 2025-07-31 07:14:46,672 - user_prompt - WARNING - Condition validation failed for: None
342
+ 2025-07-31 07:14:46,672 - user_prompt - INFO - No suitable condition found in semantic search
343
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 54.28it/s]
344
+ 2025-07-31 07:14:47,160 - retrieval - INFO - Sliding window search: Found 5 results
345
+ ✅ Detected Level: 5
346
+ Condition: generic medical query
347
+ Emergency Keywords: medical|emergency
348
+ Treatment Keywords: treatment|management
349
+ Execution Time: 11.137s
350
+ 🎉 Test PASSED - Expected behavior achieved
351
+
352
+ 🔍 level4b_002: Level 4b→5: Rare condition requiring generic medical search
353
+ Query: 'idiopathic thrombocytopenic purpura management guidelines'
354
+ Expected Level: 5
355
+ ----------------------------------------------------------------------
356
+ 🎯 Executing multilevel fallback...
357
+ 2025-07-31 07:14:47,170 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
358
+ 2025-07-31 07:14:56,483 - llm_clients - INFO - Raw LLM Response: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
359
+
360
+ (As a medical assistant, I do not provide advice, but here's the relevant condition with context for a knowledge reference.)
361
+ In this case, the most representative condition is Idiopathic Thrombocytopenic Purpura (ITP), an autoimmune disorder characterized by low platelet count (thrombocytopenia) without identifiable underlying causes. Management guidelines typically involve
362
+ 2025-07-31 07:14:56,484 - llm_clients - INFO - Query Latency: 9.3136 seconds
363
+ 2025-07-31 07:14:56,484 - llm_clients - INFO - Extracted Condition: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
364
+ 2025-07-31 07:14:56,484 - user_prompt - INFO - Starting semantic search fallback for query: 'idiopathic thrombocytopenic purpura management guidelines'
365
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.14it/s]
366
+ 2025-07-31 07:14:57,082 - retrieval - INFO - Sliding window search: Found 5 results
367
+ 2025-07-31 07:14:57,090 - user_prompt - INFO - Semantic search returned 5 results
368
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.83it/s]
369
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.94it/s]
370
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.06it/s]
371
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.59it/s]
372
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.81it/s]
373
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.78it/s]
374
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.76it/s]
375
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.14it/s]
376
+ 2025-07-31 07:14:57,296 - user_prompt - INFO - Inferred condition: None
377
+ 2025-07-31 07:14:57,296 - user_prompt - WARNING - Condition validation failed for: None
378
+ 2025-07-31 07:14:57,296 - user_prompt - INFO - No suitable condition found in semantic search
379
+ 2025-07-31 07:14:57,296 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
380
+ 2025-07-31 07:15:06,621 - llm_clients - INFO - Raw LLM Response: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
381
+
382
+ (As a medical assistant, I don't provide advice, but describe the condition and point to standard guidelines. For ITP management, refer to professional sources like the American Society of Hematology [ASH] or National Institutes of Health [NIH].)
383
+
384
+ Idiopathic Thrombocytopenic Purpura (ITP) is an autoimmune disorder characterized by low platelet count
385
+ 2025-07-31 07:15:06,621 - llm_clients - INFO - Query Latency: 9.3245 seconds
386
+ 2025-07-31 07:15:06,621 - llm_clients - INFO - Extracted Condition: The primary medical condition: Idiopathic Thrombocytopenic Purpura (ITP)
387
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.12it/s]
388
+ 2025-07-31 07:15:07,391 - retrieval - INFO - Sliding window search: Found 5 results
389
+ ✅ Detected Level: 5
390
+ Condition: generic medical query
391
+ Emergency Keywords: medical|emergency
392
+ Treatment Keywords: treatment|management
393
+ Execution Time: 20.228s
394
+ 🎉 Test PASSED - Expected behavior achieved
395
+
396
+ 🔍 level4b_003: Level 4b→5: Rare emergency condition → generic search
397
+ Query: 'necrotizing fasciitis surgical intervention protocols'
398
+ Expected Level: 5
399
+ ----------------------------------------------------------------------
400
+ 🎯 Executing multilevel fallback...
401
+ 2025-07-31 07:15:07,398 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
402
+ 2025-07-31 07:15:16,625 - llm_clients - INFO - Raw LLM Response: The primary medical condition: Necrotizing Fasciitis
403
+
404
+ In this context, the key condition is Necrotizing Fasciitis, a severe bacterial infection characterized by rapid destruction of subcutaneous tissue and fascia. The term provided, "surgical intervention protocols," refers to the treatment approach rather than a distinct medical condition. However, for clarity in this answer, I'll address it as it pertains to managing Necrotizing Fasciitis.
405
+
406
+ In Necrotizing Fasciitis, surgical
407
+ 2025-07-31 07:15:16,625 - llm_clients - INFO - Query Latency: 9.2271 seconds
408
+ 2025-07-31 07:15:16,625 - llm_clients - INFO - Extracted Condition: The primary medical condition: Necrotizing Fasciitis
409
+ 2025-07-31 07:15:16,625 - user_prompt - INFO - Starting semantic search fallback for query: 'necrotizing fasciitis surgical intervention protocols'
410
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.01it/s]
411
+ 2025-07-31 07:15:17,212 - retrieval - INFO - Sliding window search: Found 5 results
412
+ 2025-07-31 07:15:17,222 - user_prompt - INFO - Semantic search returned 5 results
413
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.01it/s]
414
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.04it/s]
415
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.57it/s]
416
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.92it/s]
417
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.15it/s]
418
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.28it/s]
419
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.83it/s]
420
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.38it/s]
421
+ 2025-07-31 07:15:17,449 - user_prompt - INFO - Inferred condition: None
422
+ 2025-07-31 07:15:17,449 - user_prompt - WARNING - Condition validation failed for: None
423
+ 2025-07-31 07:15:17,449 - user_prompt - INFO - No suitable condition found in semantic search
424
+ 2025-07-31 07:15:17,449 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
425
+ 2025-07-31 07:15:24,511 - llm_clients - INFO - Raw LLM Response: The most representative condition: Necrotizing Fasciitis
426
+
427
+ (As a medical assistant, I do not provide advice, only identify conditions. For necrotizing fasciitis, surgical intervention typically involves aggressive debridement—removing dead tissue—and may require repeated procedures until healthy margins are achieved. This is accompanied by supportive care and antibiotics.)
428
+
429
+
430
+ 2025-07-31 07:15:24,511 - llm_clients - INFO - Query Latency: 7.0619 seconds
431
+ 2025-07-31 07:15:24,511 - llm_clients - INFO - Extracted Condition: The most representative condition: Necrotizing Fasciitis
432
+ Batches: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.83it/s]
433
+ 2025-07-31 07:15:25,078 - retrieval - INFO - Sliding window search: Found 5 results
434
+ ✅ Detected Level: 5
435
+ Condition: generic medical query
436
+ Emergency Keywords: medical|emergency
437
+ Treatment Keywords: treatment|management
438
+ Execution Time: 17.692s
439
+ 🎉 Test PASSED - Expected behavior achieved
440
+
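Before the summary report, it is worth noting how PASS versus PARTIAL appears to be decided in these runs: a case passes when the detected level matches, or when the expected condition is still recovered at an earlier level (as in level2_001); everything else that completes is marked partial. The sketch below reproduces that observed behaviour; the function and argument names are assumptions, not the harness code itself.

```python
# Grading sketch that reproduces the PASS/PARTIAL outcomes seen in this run.
def grade(expected_level, detected_level, expected_conditions, detected_condition):
    level_ok = detected_level == expected_level
    condition_ok = (not expected_conditions
                    or detected_condition in expected_conditions)
    if level_ok and condition_ok:
        return "PASS"
    if expected_conditions and condition_ok:
        # e.g. level2_001: expected Level 2, matched at Level 1, still a pass
        return "PASS"
    return "PARTIAL"
```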
441
+ ================================================================================
442
+ 📊 MULTILEVEL FALLBACK TEST REPORT
443
+ ================================================================================
444
+ 🕐 Execution Summary:
445
+ Total duration: 187.465s
446
+ Average per test: 14.420s
447
+
448
+ 📈 Test Results:
449
+ Total tests: 13
450
+ Passed: 7 ✅
451
+ Partial: 6 ⚠️
452
+ Failed: 0 ❌
453
+ Success rate: 53.8%
454
+
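As a quick consistency check on the headline figures (the per-case detail further down lists 7 PASS and 6 PARTIAL out of 13, with nothing failing outright):

```python
passed, partial, total = 7, 6, 13
failed = total - passed - partial              # 0
success_rate = round(100 * passed / total, 1)  # 53.8
```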
455
+ 🎯 Level Distribution Analysis:
456
+ Level 1 (Predefined Mapping): 4 tests, avg 5.752s
457
+ Level 5 (Generic Search): 9 tests, avg 17.495s
458
+
459
+ 📋 Category Analysis:
460
+ level1_predefined: 3/3 (100.0%)
461
+ level2_llm: 1/2 (50.0%)
462
+ level3_semantic: 0/2 (0.0%)
463
+ level4a_rejection: 0/3 (0.0%)
464
+ level4b_to_5: 3/3 (100.0%)
465
+
466
+ 📝 Detailed Test Results:
467
+
468
+ level1_001: ✅ PASS
469
+ Query: 'acute myocardial infarction treatment'
470
+ Expected Level: 1
471
+ Detected Level: 1
472
+ Condition: acute myocardial infarction
473
+ Time: 0.000s
474
+ Validation: ✅ Level 1 as expected. ✅ Condition 'acute myocardial infarction' matches expected.
475
+
476
+ level1_002: ✅ PASS
477
+ Query: 'how to manage acute stroke?'
478
+ Expected Level: 1
479
+ Detected Level: 1
480
+ Condition: acute stroke
481
+ Time: 0.000s
482
+ Validation: ✅ Level 1 as expected. ✅ Condition 'acute stroke' matches expected.
483
+
484
+ level1_003: ✅ PASS
485
+ Query: 'pulmonary embolism emergency protocol'
486
+ Expected Level: 1
487
+ Detected Level: 1
488
+ Condition: pulmonary embolism
489
+ Time: 0.000s
490
+ Validation: ✅ Level 1 as expected. ✅ Condition 'pulmonary embolism' matches expected.
491
+
492
+ level2_001: ✅ PASS
493
+ Query: 'patient with severe crushing chest pain radiating to left arm'
494
+ Expected Level: 2
495
+ Detected Level: 1
496
+ Condition: acute myocardial infarction
497
+ Time: 23.008s
498
+ Validation: ⚠️ Level 1 != expected 2. ✅ Condition 'acute myocardial infarction' matches expected.
499
+
500
+ level2_002: ⚠️ PARTIAL
501
+ Query: 'sudden onset weakness on right side with speech difficulty'
502
+ Expected Level: 2
503
+ Detected Level: 5
504
+ Condition: generic medical query
505
+ Time: 22.223s
506
+ Validation: ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
507
+
508
+ level3_001: ⚠️ PARTIAL
509
+ Query: 'emergency management of cardiovascular crisis'
510
+ Expected Level: 3
511
+ Detected Level: 5
512
+ Condition: generic medical query
513
+ Time: 11.647s
514
+ Validation: ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
515
+
516
+ level3_002: ⚠️ PARTIAL
517
+ Query: 'urgent neurological intervention protocols'
518
+ Expected Level: 3
519
+ Detected Level: 5
520
+ Condition: generic medical query
521
+ Time: 10.398s
522
+ Validation: ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
523
+
524
+ level4a_001: ⚠️ PARTIAL
525
+ Query: 'how to cook pasta properly?'
526
+ Expected Level: 4
527
+ Detected Level: 5
528
+ Condition: generic medical query
529
+ Time: 20.900s
530
+ Validation: ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
531
+
532
+ level4a_002: ⚠️ PARTIAL
533
+ Query: 'best programming language to learn in 2025'
534
+ Expected Level: 4
535
+ Detected Level: 5
536
+ Condition: generic medical query
537
+ Time: 22.107s
538
+ Validation: ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
539
+
540
+ level4a_003: ⚠️ PARTIAL
541
+ Query: 'weather forecast for tomorrow'
542
+ Expected Level: 4
543
+ Detected Level: 5
544
+ Condition: generic medical query
545
+ Time: 21.128s
546
+ Validation: ⚠️ Level 5 != expected 4. ⚠️ Query should have been rejected.
547
+
548
+ level4b_001: ✅ PASS
549
+ Query: 'rare hematologic malignancy treatment approaches'
550
+ Expected Level: 5
551
+ Detected Level: 5
552
+ Condition: generic medical query
553
+ Time: 11.137s
554
+ Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
555
+
556
+ level4b_002: ✅ PASS
557
+ Query: 'idiopathic thrombocytopenic purpura management guidelines'
558
+ Expected Level: 5
559
+ Detected Level: 5
560
+ Condition: generic medical query
561
+ Time: 20.228s
562
+ Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
563
+
564
+ level4b_003: ✅ PASS
565
+ Query: 'necrotizing fasciitis surgical intervention protocols'
566
+ Expected Level: 5
567
+ Detected Level: 5
568
+ Condition: generic medical query
569
+ Time: 17.692s
570
+ Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
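Taken together, the traces above imply a dispatcher of roughly the following shape. The keyword strings are copied from the log; every helper below is a stub, so only the control flow is meaningful, not the project's real implementation.

```python
# Self-contained sketch of the 5-level fallback the traces walk through.
# Keyword strings come from the log; the helpers are stubs for illustration.

PREDEFINED = {
    "acute myocardial infarction": ("MI|chest pain|cardiac arrest",
                                    "aspirin|nitroglycerin|thrombolytic|PCI"),
    "acute stroke": ("stroke|neurological deficit|sudden weakness",
                     "tPA|thrombolysis|stroke unit care"),
    "pulmonary embolism": ("chest pain|shortness of breath|sudden dyspnea",
                           "anticoagulation|heparin|embolectomy"),
}

def call_medical_llm(query):          # stub for llm_clients (Llama3-Med42-70B)
    return None

def semantic_search_fallback(query):  # stub for the user_prompt/retrieval path
    return None

def is_non_medical(query):            # stub: real check inspects the LLM reply
    return False

def extract_condition_keywords(query: str) -> dict:
    q = query.lower()
    for cond, (emergency, treatment) in PREDEFINED.items():   # Level 1
        if cond in q:
            return {"level": 1, "condition": cond,
                    "emergency": emergency, "treatment": treatment}
    for level, finder in ((2, call_medical_llm),               # Level 2
                          (3, semantic_search_fallback)):      # Level 3
        cond = finder(query)
        if cond in PREDEFINED:
            emergency, treatment = PREDEFINED[cond]
            return {"level": level, "condition": cond,
                    "emergency": emergency, "treatment": treatment}
    if is_non_medical(query):                                  # Level 4
        return {"level": 4, "condition": None,
                "emergency": None, "treatment": None}
    return {"level": 5, "condition": "generic medical query",  # Level 5
            "emergency": "medical|emergency",
            "treatment": "treatment|management"}
```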
tests/result_of_test_multlevel_fallback_validation_revised.md ADDED
@@ -0,0 +1,534 @@
1
+ 🏥 OnCall.ai Multilevel Fallback Validation Test
2
+ ============================================================
3
+ 🔧 Initializing Components for Multilevel Fallback Test...
4
+ ------------------------------------------------------------
5
+ 1. Initializing Llama3-Med42-70B Client...
6
+ 2025-07-31 07:51:06,059 - llm_clients - INFO - Medical LLM client initialized with model: m42-health/Llama3-Med42-70B
7
+ 2025-07-31 07:51:06,059 - llm_clients - WARNING - Medical LLM Model: Research tool only. Not for professional medical diagnosis.
8
+ ✅ LLM client initialized
9
+ 2. Initializing Retrieval System...
10
+ 2025-07-31 07:51:06,059 - retrieval - INFO - Initializing retrieval system...
11
+ 2025-07-31 07:51:06,073 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
12
+ 2025-07-31 07:51:06,073 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
13
+ 2025-07-31 07:51:09,264 - retrieval - INFO - Embedding model loaded successfully
14
+ 2025-07-31 07:51:10,711 - retrieval - INFO - Chunks loaded successfully
15
+ 2025-07-31 07:51:10,824 - retrieval - INFO - Embeddings loaded successfully
16
+ 2025-07-31 07:51:10,825 - retrieval - INFO - Loaded existing emergency index
17
+ 2025-07-31 07:51:10,826 - retrieval - INFO - Loaded existing treatment index
18
+ 2025-07-31 07:51:10,826 - retrieval - INFO - Retrieval system initialized successfully
19
+ ✅ Retrieval system initialized
20
+ 3. Initializing User Prompt Processor...
21
+ 2025-07-31 07:51:10,826 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
22
+ 2025-07-31 07:51:10,826 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
23
+ 2025-07-31 07:51:12,702 - user_prompt - INFO - UserPromptProcessor initialized
24
+ ✅ User prompt processor initialized
25
+
26
+ 🎉 All components initialized successfully!
27
+
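The initialization log above shows the same embedding model being loaded twice, once for the retrieval system and once for the prompt processor, which accounts for much of the start-up time. A minimal sketch of sharing one encoder instead is below; the class and method names are assumptions, only the model id comes from the log.

```python
# Sketch: share a single encoder between retrieval and prompt processing.
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = "NeuML/pubmedbert-base-embeddings"  # from the log

class SharedEncoder:
    def __init__(self, model_name: str = EMBEDDING_MODEL):
        self.model = SentenceTransformer(model_name)

    def encode(self, texts):
        # show_progress_bar=True is what produces the 'Batches: 100%|' lines
        return self.model.encode(texts, show_progress_bar=True)
```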
28
+ 🚀 Starting Multilevel Fallback Test Suite
29
+ Total test cases: 13
30
+ Test started at: 2025-07-31 07:51:06
31
+ ================================================================================
32
+
33
+ 🔍 level1_001: Level 1: Direct predefined condition match
34
+ Query: 'acute myocardial infarction treatment'
35
+ Expected Level: 1
36
+ ----------------------------------------------------------------------
37
+ 🎯 Executing multilevel fallback...
38
+ 2025-07-31 07:51:12,702 - user_prompt - INFO - Matched predefined condition: acute myocardial infarction
39
+ ✅ Detected Level: 1
40
+ Condition: acute myocardial infarction
41
+ Emergency Keywords: MI|chest pain|cardiac arrest
42
+ Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
43
+ Execution Time: 0.000s
44
+ 🎉 Test PASSED - Expected behavior achieved
45
+
46
+ 🔍 level1_002: Level 1: Predefined stroke condition
47
+ Query: 'how to manage acute stroke?'
48
+ Expected Level: 1
49
+ ----------------------------------------------------------------------
50
+ 🎯 Executing multilevel fallback...
51
+ 2025-07-31 07:51:12,702 - user_prompt - INFO - Matched predefined condition: acute stroke
52
+ ✅ Detected Level: 1
53
+ Condition: acute stroke
54
+ Emergency Keywords: stroke|neurological deficit|sudden weakness
55
+ Treatment Keywords: tPA|thrombolysis|stroke unit care
56
+ Execution Time: 0.000s
57
+ 🎉 Test PASSED - Expected behavior achieved
58
+
59
+ 🔍 level1_003: Level 1: Predefined PE condition
60
+ Query: 'pulmonary embolism emergency protocol'
61
+ Expected Level: 1
62
+ ----------------------------------------------------------------------
63
+ 🎯 Executing multilevel fallback...
64
+ 2025-07-31 07:51:12,702 - user_prompt - INFO - Matched predefined condition: pulmonary embolism
65
+ ✅ Detected Level: 1
66
+ Condition: pulmonary embolism
67
+ Emergency Keywords: chest pain|shortness of breath|sudden dyspnea
68
+ Treatment Keywords: anticoagulation|heparin|embolectomy
69
+ Execution Time: 0.000s
70
+ 🎉 Test PASSED - Expected behavior achieved
71
+
72
+ 🔍 level2_001: Level 2: Symptom-based query requiring LLM analysis
73
+ Query: 'patient with severe crushing chest pain radiating to left arm'
74
+ Expected Level: 2
75
+ ----------------------------------------------------------------------
76
+ 🎯 Executing multilevel fallback...
77
+ 2025-07-31 07:51:12,702 - llm_clients - INFO - Calling Medical LLM with query: patient with severe crushing chest pain radiating to left arm
78
+ 2025-07-31 07:51:55,277 - llm_clients - INFO - Raw LLM Response: Medical: "Acute Myocardial Infarction" (Heart Attack)
79
+ Explanation: The described symptoms of severe crushing chest pain radiating to the left arm are highly indicative of an acute myocardial infarction, commonly known as a heart attack. This is a medical emergency caused by blockage of coronary arteries, disrupting blood supply to the heart muscle.
80
+
81
+ (Not providing advice, just categorizing the condition)
82
+ 2025-07-31 07:51:55,278 - llm_clients - INFO - Query Latency: 42.5747 seconds
83
+ 2025-07-31 07:51:55,278 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
84
+ ✅ Detected Level: 1
85
+ Condition: acute myocardial infarction
86
+ Emergency Keywords: MI|chest pain|cardiac arrest
87
+ Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
88
+ Execution Time: 42.576s
89
+ 🎉 Test PASSED - Expected behavior achieved
90
+
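The level2_001 trace above passes because the extraction step reduces the raw reply Medical: "Acute Myocardial Infarction" (Heart Attack) to the canonical string acute myocardial infarction, which then hits the predefined mapping (reported as Level 1). When that normalization fails, as in level2_002 just below, the pipeline falls back to semantic search. A sketch of such a normalization follows; the regex heuristics and names are assumptions, not the exact logic in llm_clients.

```python
# Sketch of normalizing the raw LLM reply into a predefined condition name.
import re

PREDEFINED_CONDITIONS = {"acute myocardial infarction", "acute stroke",
                         "pulmonary embolism"}

def extract_condition(raw_reply: str) -> str | None:
    if "NON_MEDICAL_QUERY" in raw_reply:
        return None                    # handled by the Level 4 rejection path
    match = re.search(r'"([^"]+)"', raw_reply)   # e.g. Medical: "Acute ..."
    candidate = (match.group(1) if match else raw_reply).lower()
    candidate = re.sub(r"\s*\(.*?\)", "", candidate).strip()  # drop qualifiers
    return candidate if candidate in PREDEFINED_CONDITIONS else None
```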
91
+ 🔍 level2_002: Level 2: Neurological symptoms requiring LLM
92
+ Query: 'sudden onset weakness on right side with speech difficulty'
93
+ Expected Level: 2
94
+ ----------------------------------------------------------------------
95
+ 🎯 Executing multilevel fallback...
96
+ 2025-07-31 07:51:55,279 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
97
+ 2025-07-31 07:52:06,165 - llm_clients - INFO - Raw LLM Response: Medical: "Acute Ischemic Stroke" (or Cerebrovascular Accident, specifically involving right hemispheric damage causing contralateral weakness and speech impairment)
98
+
99
+ Explanation: The symptoms described - sudden onset weakness on the right side (implying left brain hemisphere involvement due to contralateral motor control) and speech difficulty - are classic indicators of an acute ischemic stroke. This condition occurs when blood flow to a region of the brain is blocked, depriving it of oxygen and nutrients,
100
+ 2025-07-31 07:52:06,165 - llm_clients - INFO - Query Latency: 10.8864 seconds
101
+ 2025-07-31 07:52:06,165 - llm_clients - INFO - Extracted Condition: Medical: "Acute Ischemic Stroke" (or Cerebrovascular Accident, specifically involving right hemispheric damage causing contralateral weakness and speech impairment)
102
+ 2025-07-31 07:52:06,166 - user_prompt - INFO - Starting semantic search fallback for query: 'sudden onset weakness on right side with speech difficulty'
103
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.61it/s]
104
+ 2025-07-31 07:52:07,568 - retrieval - INFO - Sliding window search: Found 5 results
105
+ 2025-07-31 07:52:07,575 - user_prompt - INFO - Semantic search returned 5 results
106
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.70it/s]
107
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.71it/s]
108
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.64it/s]
109
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.46it/s]
110
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.59it/s]
111
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.61it/s]
112
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.26it/s]
113
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.86it/s]
114
+ 2025-07-31 07:52:07,896 - user_prompt - INFO - Inferred condition: None
115
+ 2025-07-31 07:52:07,896 - user_prompt - WARNING - Condition validation failed for: None
116
+ 2025-07-31 07:52:07,896 - user_prompt - INFO - No suitable condition found in semantic search
117
+ 2025-07-31 07:52:07,897 - llm_clients - INFO - Calling Medical LLM with query: sudden onset weakness on right side with speech difficulty
118
+ 2025-07-31 07:52:16,923 - llm_clients - INFO - Raw LLM Response: Medical: "Cerebrovascular Accident (CVA) - Ischemic Stroke" (or simply "Ischemic Stroke" for brevity, as it's the most specific diagnosis here)
119
+ - Explanation: The symptoms described, sudden right-sided weakness and speech difficulty, are classic indicators of an ischemic stroke, which occurs when blood flow to the brain is blocked by a clot or narrowed blood vessels.
120
+
121
+ Note: While hemorrhagic stroke is another type of CVA, the given symptoms
122
+ 2025-07-31 07:52:16,923 - llm_clients - INFO - Query Latency: 9.0264 seconds
123
+ 2025-07-31 07:52:16,923 - llm_clients - INFO - Extracted Condition: Medical: "Cerebrovascular Accident (CVA) - Ischemic Stroke" (or simply "Ischemic Stroke" for brevity, as it's the most specific diagnosis here)
124
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.89it/s]
125
+ 2025-07-31 07:52:17,964 - retrieval - INFO - Sliding window search: Found 5 results
126
+ ✅ Detected Level: 5
127
+ Condition: generic medical query
128
+ Emergency Keywords: medical|emergency
129
+ Treatment Keywords: treatment|management
130
+ Execution Time: 22.751s
131
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
132
+
133
+ 🔍 level3_001: Level 3: Generic medical terms requiring semantic search
134
+ Query: 'emergency management of cardiovascular crisis'
135
+ Expected Level: 3
136
+ ----------------------------------------------------------------------
137
+ 🎯 Executing multilevel fallback...
138
+ 2025-07-31 07:52:18,030 - llm_clients - INFO - Calling Medical LLM with query: emergency management of cardiovascular crisis
139
+ 2025-07-31 07:52:27,145 - llm_clients - INFO - Raw LLM Response: Medical: "Cardiovascular crisis" in this context (emergency management) is best represented by "Acute Myocardial Infarction (AMI)" or "ST-Elevation Myocardial Infarction (STEMI)," as both terms describe severe, time-critical cardiac events requiring immediate intervention. However, if considering a broader "cardiovascular crisis" that's not limited to infarction, "Cardiogenic Shock" might also be applicable, as it represents a severe, life
140
+ 2025-07-31 07:52:27,145 - llm_clients - INFO - Query Latency: 9.1143 seconds
141
+ 2025-07-31 07:52:27,145 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
142
+ ✅ Detected Level: 1
143
+ Condition: acute myocardial infarction
144
+ Emergency Keywords: MI|chest pain|cardiac arrest
145
+ Treatment Keywords: aspirin|nitroglycerin|thrombolytic|PCI
146
+ Execution Time: 9.115s
147
+ ⚠️ Test PARTIAL - ⚠️ Level 1 != expected 3. ⚠️ Condition 'acute myocardial infarction' != expected [].
148
+
149
+ 🔍 level3_002: Level 3: Medical terminology requiring semantic fallback
150
+ Query: 'urgent neurological intervention protocols'
151
+ Expected Level: 3
152
+ ----------------------------------------------------------------------
153
+ 🎯 Executing multilevel fallback...
154
+ 2025-07-31 07:52:27,145 - llm_clients - INFO - Calling Medical LLM with query: urgent neurological intervention protocols
155
+ 2025-07-31 07:52:37,615 - llm_clients - INFO - Raw LLM Response: Medical: "Acute Ischemic Stroke" (representing a condition requiring urgent neurological intervention, specifically thrombectomy or thrombolysis protocols)
156
+
157
+ Explanation: Acute ischemic stroke necessitates rapid medical response, as timely interventions like mechanical thrombectomy or intravenous thrombolysis can significantly improve patient outcomes. The term "urgent neurological intervention protocols" in this context likely refers to these treatments for stroke, making "Acute Ischemic Stroke" the most representative medical condition.
158
+ 2025-07-31 07:52:37,615 - llm_clients - INFO - Query Latency: 10.4695 seconds
159
+ 2025-07-31 07:52:37,615 - llm_clients - INFO - Extracted Condition: Medical: "Acute Ischemic Stroke" (representing a condition requiring urgent neurological intervention, specifically thrombectomy or thrombolysis protocols)
160
+ 2025-07-31 07:52:37,616 - user_prompt - INFO - Starting semantic search fallback for query: 'urgent neurological intervention protocols'
161
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.45it/s]
162
+ 2025-07-31 07:52:38,539 - retrieval - INFO - Sliding window search: Found 5 results
163
+ 2025-07-31 07:52:38,549 - user_prompt - INFO - Semantic search returned 5 results
164
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.55it/s]
165
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 49.53it/s]
166
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 52.73it/s]
167
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.13it/s]
168
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.36it/s]
169
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.40it/s]
170
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.10it/s]
171
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.29it/s]
172
+ 2025-07-31 07:52:38,759 - user_prompt - INFO - Inferred condition: None
173
+ 2025-07-31 07:52:38,759 - user_prompt - WARNING - Condition validation failed for: None
174
+ 2025-07-31 07:52:38,759 - user_prompt - INFO - No suitable condition found in semantic search
175
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.09it/s]
176
+ 2025-07-31 07:52:39,345 - retrieval - INFO - Sliding window search: Found 5 results
177
+ ✅ Detected Level: 5
178
+ Condition: generic medical query
179
+ Emergency Keywords: medical|emergency
180
+ Treatment Keywords: treatment|management
181
+ Execution Time: 12.249s
182
+ ⚠️ Test PARTIAL - ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
183
+
184
+ 🔍 level4a_001: Level 4a: Non-medical query should be rejected
185
+ Query: 'how to cook pasta properly?'
186
+ Expected Level: 4
187
+ ----------------------------------------------------------------------
188
+ 🎯 Executing multilevel fallback...
189
+ 2025-07-31 07:52:39,395 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
190
+ 2025-07-31 07:52:45,753 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about culinary technique (cooking pasta) and not related to medical conditions or health issues. It does not represent a medical topic for diagnosis or advice. Instead, it's a question of food preparation, typically addressed in cookbooks or culinary resources.
191
+ 2025-07-31 07:52:45,753 - llm_clients - INFO - Query Latency: 6.3575 seconds
192
+ 2025-07-31 07:52:45,753 - llm_clients - INFO - Extracted Condition:
193
+ 2025-07-31 07:52:45,753 - user_prompt - INFO - Starting semantic search fallback for query: 'how to cook pasta properly?'
194
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.47it/s]
195
+ 2025-07-31 07:52:47,084 - retrieval - INFO - Sliding window search: Found 5 results
196
+ 2025-07-31 07:52:47,091 - user_prompt - INFO - Semantic search returned 5 results
197
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 14.54it/s]
198
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.74it/s]
199
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.14it/s]
200
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.37it/s]
201
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.26it/s]
202
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.53it/s]
203
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.35it/s]
204
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.17it/s]
205
+ 2025-07-31 07:52:47,305 - user_prompt - INFO - Inferred condition: None
206
+ 2025-07-31 07:52:47,305 - user_prompt - WARNING - Condition validation failed for: None
207
+ 2025-07-31 07:52:47,305 - user_prompt - INFO - No suitable condition found in semantic search
208
+ 2025-07-31 07:52:47,305 - llm_clients - INFO - Calling Medical LLM with query: how to cook pasta properly?
209
+ 2025-07-31 07:52:53,999 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about culinary technique (cooking pasta) and not related to any medical condition or health issue. It involves instructions on food preparation rather than addressing a disease, symptom, or medical concern.
210
+ 2025-07-31 07:52:53,999 - llm_clients - INFO - Query Latency: 6.6933 seconds
211
+ 2025-07-31 07:52:53,999 - llm_clients - INFO - Extracted Condition:
212
+ ✅ Detected Level: 4
213
+ Condition: None
214
+ Emergency Keywords: None
215
+ Treatment Keywords: None
216
+ Execution Time: 14.604s
217
+ 🎉 Test PASSED - Expected behavior achieved
218
+
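In this revised run the non-medical queries are caught at Level 4 because the model now replies with an explicit NON_MEDICAL_QUERY sentinel and the extracted condition comes back empty; in the earlier run, the verbose refusals slipped through to Level 5. A minimal sketch of that rejection check is below, with assumed names (only the sentinel string itself comes from the log).

```python
# Sketch of the Level 4 rejection check keyed on the logged sentinel.
NON_MEDICAL_SENTINEL = "NON_MEDICAL_QUERY"

def reject_if_non_medical(raw_reply: str) -> dict | None:
    """Return a Level 4 result when the LLM flags the query as non-medical;
    otherwise return None so later levels can run."""
    if raw_reply.strip().startswith(NON_MEDICAL_SENTINEL):
        return {"level": 4, "condition": None,
                "emergency_keywords": None, "treatment_keywords": None}
    return None
```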
219
+ 🔍 level4a_002: Level 4a: Technology query should be rejected
220
+ Query: 'best programming language to learn in 2025'
221
+ Expected Level: 4
222
+ ----------------------------------------------------------------------
223
+ 🎯 Executing multilevel fallback...
224
+ 2025-07-31 07:52:54,000 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
225
+ 2025-07-31 07:53:00,100 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about technology (specifically, programming languages) and their future relevance, rather than a medical condition or health topic. It doesn't pertain to diagnosis, treatment, or any medical aspect. Therefore, it's not a medical query.
226
+ 2025-07-31 07:53:00,100 - llm_clients - INFO - Query Latency: 6.1004 seconds
227
+ 2025-07-31 07:53:00,100 - llm_clients - INFO - Extracted Condition:
228
+ 2025-07-31 07:53:00,100 - user_prompt - INFO - Starting semantic search fallback for query: 'best programming language to learn in 2025'
229
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.94it/s]
230
+ 2025-07-31 07:53:00,968 - retrieval - INFO - Sliding window search: Found 5 results
231
+ 2025-07-31 07:53:01,048 - user_prompt - INFO - Semantic search returned 5 results
232
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.26it/s]
233
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.37it/s]
234
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.33it/s]
235
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.62it/s]
236
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 56.23it/s]
237
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.59it/s]
238
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.53it/s]
239
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.69it/s]
240
+ 2025-07-31 07:53:01,255 - user_prompt - INFO - Inferred condition: None
241
+ 2025-07-31 07:53:01,255 - user_prompt - WARNING - Condition validation failed for: None
242
+ 2025-07-31 07:53:01,255 - user_prompt - INFO - No suitable condition found in semantic search
243
+ 2025-07-31 07:53:01,256 - llm_clients - INFO - Calling Medical LLM with query: best programming language to learn in 2025
244
+ 2025-07-31 07:53:06,397 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about selecting a programming language for future learning (in 2025) and has no relation to medical conditions or healthcare. It falls under the domain of computer science and technology education.
245
+ 2025-07-31 07:53:06,397 - llm_clients - INFO - Query Latency: 5.1410 seconds
246
+ 2025-07-31 07:53:06,397 - llm_clients - INFO - Extracted Condition:
247
+ ✅ Detected Level: 4
248
+ Condition: None
249
+ Emergency Keywords: None
250
+ Treatment Keywords: None
251
+ Execution Time: 12.397s
252
+ 🎉 Test PASSED - Expected behavior achieved
253
+
254
+ 🔍 level4a_003: Level 4a: Weather query should be rejected
255
+ Query: 'weather forecast for tomorrow'
256
+ Expected Level: 4
257
+ ----------------------------------------------------------------------
258
+ 🎯 Executing multilevel fallback...
259
+ 2025-07-31 07:53:06,397 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
260
+ 2025-07-31 07:53:11,119 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about meteorological information (weather prediction) and not related to any medical condition or health topic. It falls under environmental or general information, not medicine.
261
+ 2025-07-31 07:53:11,120 - llm_clients - INFO - Query Latency: 4.7219 seconds
262
+ 2025-07-31 07:53:11,120 - llm_clients - INFO - Extracted Condition:
263
+ 2025-07-31 07:53:11,120 - user_prompt - INFO - Starting semantic search fallback for query: 'weather forecast for tomorrow'
264
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.01it/s]
265
+ 2025-07-31 07:53:12,200 - retrieval - INFO - Sliding window search: Found 5 results
266
+ 2025-07-31 07:53:12,209 - user_prompt - INFO - Semantic search returned 5 results
267
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.36it/s]
268
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.03it/s]
269
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 53.14it/s]
270
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 63.88it/s]
271
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.51it/s]
272
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.11it/s]
273
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.37it/s]
274
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.13it/s]
275
+ 2025-07-31 07:53:12,415 - user_prompt - INFO - Inferred condition: None
276
+ 2025-07-31 07:53:12,415 - user_prompt - WARNING - Condition validation failed for: None
277
+ 2025-07-31 07:53:12,415 - user_prompt - INFO - No suitable condition found in semantic search
278
+ 2025-07-31 07:53:12,415 - llm_clients - INFO - Calling Medical LLM with query: weather forecast for tomorrow
279
+ 2025-07-31 07:53:17,281 - llm_clients - INFO - Raw LLM Response: NON_MEDICAL_QUERY. This inquiry is about meteorology (predicting weather conditions) and not related to medical conditions or health issues. It doesn't involve symptoms, diagnoses, or any aspect of healthcare.
280
+ 2025-07-31 07:53:17,281 - llm_clients - INFO - Query Latency: 4.8653 seconds
281
+ 2025-07-31 07:53:17,281 - llm_clients - INFO - Extracted Condition:
282
+ ✅ Detected Level: 4
283
+ Condition: None
284
+ Emergency Keywords: None
285
+ Treatment Keywords: None
286
+ Execution Time: 10.884s
287
+ 🎉 Test PASSED - Expected behavior achieved
288
+
289
+ 🔍 level4b_001: Level 4b→5: Obscure medical query passing validation to generic search
290
+ Query: 'rare hematologic malignancy treatment approaches'
291
+ Expected Level: 5
292
+ ----------------------------------------------------------------------
293
+ 🎯 Executing multilevel fallback...
294
+ 2025-07-31 07:53:17,282 - llm_clients - INFO - Calling Medical LLM with query: rare hematologic malignancy treatment approaches
295
+ 2025-07-31 07:53:26,329 - llm_clients - INFO - Raw LLM Response: Medical: "rare hematologic malignancy treatment approaches" → "Targeted Therapy for Agnogenic Myeloid Metaplasia (or currently, 'Agnogenic/Idiopathic: Myelofibrosis' in modern classification, as part of CMML-excluded rare myeloproliferative neoplasms) or, alternatively, 'Chimeric Antigen Receptor T-Cell therapy (CAR-T) for rare B-cell lymphomas like Primary Mediastinal
296
+ 2025-07-31 07:53:26,329 - llm_clients - INFO - Query Latency: 9.0470 seconds
297
+ 2025-07-31 07:53:26,331 - llm_clients - INFO - Extracted Condition: Medical: "rare hematologic malignancy treatment approaches" → "Targeted Therapy for Agnogenic Myeloid Metaplasia (or currently, 'Agnogenic/Idiopathic: Myelofibrosis' in modern classification, as part of CMML-excluded rare myeloproliferative neoplasms) or, alternatively, 'Chimeric Antigen Receptor T-Cell therapy (CAR-T) for rare B-cell lymphomas like Primary Mediastinal
298
+ 2025-07-31 07:53:26,331 - user_prompt - INFO - Starting semantic search fallback for query: 'rare hematologic malignancy treatment approaches'
299
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.05it/s]
300
+ 2025-07-31 07:53:26,871 - retrieval - INFO - Sliding window search: Found 5 results
301
+ 2025-07-31 07:53:26,880 - user_prompt - INFO - Semantic search returned 5 results
302
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.32it/s]
303
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.77it/s]
304
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.97it/s]
305
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.97it/s]
306
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.97it/s]
307
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.87it/s]
308
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.11it/s]
309
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.43it/s]
310
+ 2025-07-31 07:53:27,089 - user_prompt - INFO - Inferred condition: None
311
+ 2025-07-31 07:53:27,089 - user_prompt - WARNING - Condition validation failed for: None
312
+ 2025-07-31 07:53:27,089 - user_prompt - INFO - No suitable condition found in semantic search
313
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.43it/s]
314
+ 2025-07-31 07:53:27,626 - retrieval - INFO - Sliding window search: Found 5 results
315
+ ✅ Detected Level: 5
316
+ Condition: generic medical query
317
+ Emergency Keywords: medical|emergency
318
+ Treatment Keywords: treatment|management
319
+ Execution Time: 10.356s
320
+ 🎉 Test PASSED - Expected behavior achieved
321
+
322
+ 🔍 level4b_002: Level 4b→5: Rare condition requiring generic medical search
323
+ Query: 'idiopathic thrombocytopenic purpura management guidelines'
324
+ Expected Level: 5
325
+ ----------------------------------------------------------------------
326
+ 🎯 Executing multilevel fallback...
327
+ 2025-07-31 07:53:27,638 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
328
+ 2025-07-31 07:53:36,704 - llm_clients - INFO - Raw LLM Response: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for managing this autoimmune platelet disorder, which include corticosteroids, IVIG, thrombopoietin receptor agonists, or splenectomy in certain cases)
329
+
330
+ Explanation: This query is medical because it pertains to the guidelines for treating a specific blood disorder, Idiopathic Thrombocytopenic
331
+ 2025-07-31 07:53:36,704 - llm_clients - INFO - Query Latency: 9.0658 seconds
332
+ 2025-07-31 07:53:36,704 - llm_clients - INFO - Extracted Condition: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for managing this autoimmune platelet disorder, which include corticosteroids, IVIG, thrombopoietin receptor agonists, or splenectomy in certain cases)
333
+ 2025-07-31 07:53:36,704 - user_prompt - INFO - Starting semantic search fallback for query: 'idiopathic thrombocytopenic purpura management guidelines'
334
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.61it/s]
335
+ 2025-07-31 07:53:37,450 - retrieval - INFO - Sliding window search: Found 5 results
336
+ 2025-07-31 07:53:37,459 - user_prompt - INFO - Semantic search returned 5 results
337
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.72it/s]
338
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.43it/s]
339
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.51it/s]
340
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.38it/s]
341
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.72it/s]
342
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.74it/s]
343
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.79it/s]
344
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.22it/s]
345
+ 2025-07-31 07:53:37,691 - user_prompt - INFO - Inferred condition: None
346
+ 2025-07-31 07:53:37,691 - user_prompt - WARNING - Condition validation failed for: None
347
+ 2025-07-31 07:53:37,691 - user_prompt - INFO - No suitable condition found in semantic search
348
+ 2025-07-31 07:53:37,691 - llm_clients - INFO - Calling Medical LLM with query: idiopathic thrombocytopenic purpura management guidelines
349
+ 2025-07-31 07:53:47,836 - llm_clients - INFO - Raw LLM Response: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for therapy in this autoimmune platelet disorder, which may include corticosteroids, intravenous immunoglobulin, or thrombopoietin receptor agonists, among other strategies).
350
+
351
+ Explanation: This query is medical because it discusses guidelines for managing a specific blood disorder, idiopathic thrombocytopenic purpura
352
+ 2025-07-31 07:53:47,836 - llm_clients - INFO - Query Latency: 10.1445 seconds
353
+ 2025-07-31 07:53:47,836 - llm_clients - INFO - Extracted Condition: Medical: "Idiopathic Thrombocytopenic Purpura (ITP) Management" → "ITP Treatment Protocols" (referring to guidelines for therapy in this autoimmune platelet disorder, which may include corticosteroids, intravenous immunoglobulin, or thrombopoietin receptor agonists, among other strategies).
354
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.06it/s]
355
+ 2025-07-31 07:53:48,812 - retrieval - INFO - Sliding window search: Found 5 results
356
+ ✅ Detected Level: 5
357
+ Condition: generic medical query
358
+ Emergency Keywords: medical|emergency
359
+ Treatment Keywords: treatment|management
360
+ Execution Time: 21.183s
361
+ 🎉 Test PASSED - Expected behavior achieved
362
+
363
+ 🔍 level4b_003: Level 4b→5: Rare emergency condition → generic search
364
+ Query: 'necrotizing fasciitis surgical intervention protocols'
365
+ Expected Level: 5
366
+ ----------------------------------------------------------------------
367
+ 🎯 Executing multilevel fallback...
368
+ 2025-07-31 07:53:48,821 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
369
+ 2025-07-31 07:53:57,799 - llm_clients - INFO - Raw LLM Response: Medical: "Necrotizing Fasciitis" - In this context, the primary medical condition is Necrotizing Fasciitis, a severe soft tissue infection characterized by rapid progression and tissue death. The phrase "surgical intervention protocols" refers to the medical procedures and guidelines for surgically managing this condition, typically involving debridement (removal of dead tissue) and sometimes amputation.
370
+
371
+ Explanation: This query is medical because it pertains to a specific infectious disease (Necrotizing
372
+ 2025-07-31 07:53:57,799 - llm_clients - INFO - Query Latency: 8.9777 seconds
373
+ 2025-07-31 07:53:57,800 - llm_clients - INFO - Extracted Condition: Medical: "Necrotizing Fasciitis" - In this context, the primary medical condition is Necrotizing Fasciitis, a severe soft tissue infection characterized by rapid progression and tissue death. The phrase "surgical intervention protocols" refers to the medical procedures and guidelines for surgically managing this condition, typically involving debridement (removal of dead tissue) and sometimes amputation.
374
+ 2025-07-31 07:53:57,800 - user_prompt - INFO - Starting semantic search fallback for query: 'necrotizing fasciitis surgical intervention protocols'
375
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.58it/s]
376
+ 2025-07-31 07:53:58,405 - retrieval - INFO - Sliding window search: Found 5 results
377
+ 2025-07-31 07:53:58,414 - user_prompt - INFO - Semantic search returned 5 results
378
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.81it/s]
379
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 48.09it/s]
380
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.49it/s]
381
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.11it/s]
382
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.81it/s]
383
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.57it/s]
384
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.03it/s]
385
+ Batches: 100%|███████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.29it/s]
386
+ 2025-07-31 07:53:58,638 - user_prompt - INFO - Inferred condition: None
387
+ 2025-07-31 07:53:58,638 - user_prompt - WARNING - Condition validation failed for: None
388
+ 2025-07-31 07:53:58,638 - user_prompt - INFO - No suitable condition found in semantic search
389
+ 2025-07-31 07:53:58,638 - llm_clients - INFO - Calling Medical LLM with query: necrotizing fasciitis surgical intervention protocols
390
+ 2025-07-31 07:53:58,758 - llm_clients - ERROR - Medical LLM query error: 402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-688b8386-259e81a24556b80a163e3d17;5ec89b2d-e0da-4255-90b6-f0c7e9577b38)
391
+
392
+ You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.
393
+ 2025-07-31 07:53:58,758 - llm_clients - ERROR - Error Type: HfHubHTTPError
394
+ 2025-07-31 07:53:58,758 - llm_clients - ERROR - Detailed Error: HfHubHTTPError('402 Client Error: Payment Required for url: https://router.huggingface.co/featherless-ai/v1/chat/completions (Request ID: Root=1-688b8386-259e81a24556b80a163e3d17;5ec89b2d-e0da-4255-90b6-f0c7e9577b38)\n\nYou have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.')
395
+ 2025-07-31 07:53:58,758 - llm_clients - ERROR - Query Latency (on error): 0.1196 seconds
396
+ 2025-07-31 07:53:58,758 - llm_clients - ERROR - Query that caused error: necrotizing fasciitis surgical intervention protocols
397
+ ✅ Detected Level: 4
398
+ Condition: None
399
+ Emergency Keywords: None
400
+ Treatment Keywords: None
401
+ Execution Time: 9.937s
402
+ ⚠️ Test PARTIAL - ⚠️ Level 4 != expected 5. ⚠️ Should trigger generic medical search.
403
+
404
+ ================================================================================
405
+ 📊 MULTILEVEL FALLBACK TEST REPORT
406
+ ================================================================================
407
+ 🕐 Execution Summary:
408
+ Total duration: 172.699s
409
+ Average per test: 13.285s
410
+
411
+ 📈 Test Results:
412
+ Total tests: 13
413
+ Passed: 9 ✅
414
+ Partial: 4 ⚠️
415
+ Failed: 4 ❌
416
+ Success rate: 69.2%
417
+
418
+ 🎯 Level Distribution Analysis:
419
+ Level 1 (Predefined Mapping): 5 tests, avg 10.338s
420
+ Level 4 (Validation Rejection): 4 tests, avg 11.956s
421
+ Level 5 (Generic Search): 4 tests, avg 16.635s
422
+
423
+ 📋 Category Analysis:
424
+ level1_predefined: 3/3 (100.0%)
425
+ level2_llm: 1/2 (50.0%)
426
+ level3_semantic: 0/2 (0.0%)
427
+ level4a_rejection: 3/3 (100.0%)
428
+ level4b_to_5: 2/3 (66.7%)
429
+
430
+ 📝 Detailed Test Results:
431
+
432
+ level1_001: ✅ PASS
433
+ Query: 'acute myocardial infarction treatment'
434
+ Expected Level: 1
435
+ Detected Level: 1
436
+ Condition: acute myocardial infarction
437
+ Time: 0.000s
438
+ Validation: ✅ Level 1 as expected. ✅ Condition 'acute myocardial infarction' matches expected.
439
+
440
+ level1_002: ✅ PASS
441
+ Query: 'how to manage acute stroke?'
442
+ Expected Level: 1
443
+ Detected Level: 1
444
+ Condition: acute stroke
445
+ Time: 0.000s
446
+ Validation: ✅ Level 1 as expected. ✅ Condition 'acute stroke' matches expected.
447
+
448
+ level1_003: ✅ PASS
449
+ Query: 'pulmonary embolism emergency protocol'
450
+ Expected Level: 1
451
+ Detected Level: 1
452
+ Condition: pulmonary embolism
453
+ Time: 0.000s
454
+ Validation: ✅ Level 1 as expected. ✅ Condition 'pulmonary embolism' matches expected.
455
+
456
+ level2_001: ✅ PASS
457
+ Query: 'patient with severe crushing chest pain radiating to left arm'
458
+ Expected Level: 2
459
+ Detected Level: 1
460
+ Condition: acute myocardial infarction
461
+ Time: 42.576s
462
+ Validation: ⚠️ Level 1 != expected 2. ✅ Condition 'acute myocardial infarction' matches expected.
463
+
464
+ level2_002: ⚠️ PARTIAL
465
+ Query: 'sudden onset weakness on right side with speech difficulty'
466
+ Expected Level: 2
467
+ Detected Level: 5
468
+ Condition: generic medical query
469
+ Time: 22.751s
470
+ Validation: ⚠️ Level 5 != expected 2. ⚠️ Condition 'generic medical query' != expected ['acute stroke', 'cerebrovascular accident'].
471
+
472
+ level3_001: ⚠️ PARTIAL
473
+ Query: 'emergency management of cardiovascular crisis'
474
+ Expected Level: 3
475
+ Detected Level: 1
476
+ Condition: acute myocardial infarction
477
+ Time: 9.115s
478
+ Validation: ⚠️ Level 1 != expected 3. ⚠️ Condition 'acute myocardial infarction' != expected [].
479
+
480
+ level3_002: ⚠️ PARTIAL
481
+ Query: 'urgent neurological intervention protocols'
482
+ Expected Level: 3
483
+ Detected Level: 5
484
+ Condition: generic medical query
485
+ Time: 12.249s
486
+ Validation: ⚠️ Level 5 != expected 3. ⚠️ Condition 'generic medical query' != expected [].
487
+
488
+ level4a_001: ✅ PASS
489
+ Query: 'how to cook pasta properly?'
490
+ Expected Level: 4
491
+ Detected Level: 4
492
+ Condition: None
493
+ Time: 14.604s
494
+ Validation: ✅ Level 4 as expected. ✅ Query correctly rejected.
495
+
496
+ level4a_002: ✅ PASS
497
+ Query: 'best programming language to learn in 2025'
498
+ Expected Level: 4
499
+ Detected Level: 4
500
+ Condition: None
501
+ Time: 12.397s
502
+ Validation: ✅ Level 4 as expected. ✅ Query correctly rejected.
503
+
504
+ level4a_003: ✅ PASS
505
+ Query: 'weather forecast for tomorrow'
506
+ Expected Level: 4
507
+ Detected Level: 4
508
+ Condition: None
509
+ Time: 10.884s
510
+ Validation: ✅ Level 4 as expected. ✅ Query correctly rejected.
511
+
512
+ level4b_001: ✅ PASS
513
+ Query: 'rare hematologic malignancy treatment approaches'
514
+ Expected Level: 5
515
+ Detected Level: 5
516
+ Condition: generic medical query
517
+ Time: 10.356s
518
+ Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
519
+
520
+ level4b_002: ✅ PASS
521
+ Query: 'idiopathic thrombocytopenic purpura management guidelines'
522
+ Expected Level: 5
523
+ Detected Level: 5
524
+ Condition: generic medical query
525
+ Time: 21.183s
526
+ Validation: ✅ Level 5 as expected. ✅ Generic medical search triggered.
527
+
528
+ level4b_003: ⚠️ PARTIAL
529
+ Query: 'necrotizing fasciitis surgical intervention protocols'
530
+ Expected Level: 5
531
+ Detected Level: 4
532
+ Condition: None
533
+ Time: 9.937s
534
+ Validation: ⚠️ Level 4 != expected 5. ⚠️ Should trigger generic medical search.
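
Note on the fallback order exercised in the report above: the log walks Level 1 (predefined mapping) through Level 5 (generic search). The sketch below summarizes that control flow under assumed names; the matcher, extractor, and search callables are placeholders for illustration, not the actual `user_prompt.py` API.

```python
# Minimal sketch of the fallback order shown in the report above.
# All callables are assumed placeholders, not the project's real functions.
def multilevel_fallback(query, match_predefined, extract_with_llm,
                        semantic_search, looks_medical):
    # Level 1: predefined condition mapping (instant, no LLM call)
    condition = match_predefined(query)
    if condition:
        return {"level": 1, "condition": condition}

    # Level 2: LLM-based condition extraction
    condition = extract_with_llm(query)
    if condition:
        return {"level": 2, "condition": condition}

    # Level 3: semantic search over indexed conditions
    condition = semantic_search(query)
    if condition:
        return {"level": 3, "condition": condition}

    # Level 4: reject queries flagged as non-medical (NON_MEDICAL_QUERY)
    if not looks_medical(query):
        return {"level": 4, "condition": None}

    # Level 5: medical but unmatched, fall back to a generic medical search
    return {"level": 5, "condition": "generic medical query",
            "emergency_keywords": "medical|emergency",
            "treatment_keywords": "treatment|management"}
```

Read this way, level4b_003 likely stopped at Level 4 only because the second LLM call failed with the 402 quota error, so the medical/non-medical check never got a usable answer and Level 5 was never reached.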
tests/result_of_test_userinput_userprompt_medical_condition_llm.md ADDED
@@ -0,0 +1,381 @@
1
+ 🏥 OnCall.ai Medical Query Processing Pipeline Test
2
+ ============================================================
3
+ 🔧 Initializing Pipeline Components...
4
+ --------------------------------------------------
5
+ 1. Initializing Llama3-Med42-70B Client...
6
+ 2025-07-31 06:38:22,609 - llm_clients - INFO - Medical LLM client initialized with model: m42-health/Llama3-Med42-70B
7
+ 2025-07-31 06:38:22,609 - llm_clients - WARNING - Medical LLM Model: Research tool only. Not for professional medical diagnosis.
8
+ ✅ LLM client initialized successfully
9
+ 2. Initializing Retrieval System...
10
+ 2025-07-31 06:38:22,609 - retrieval - INFO - Initializing retrieval system...
11
+ 2025-07-31 06:38:22,621 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
12
+ 2025-07-31 06:38:22,621 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
13
+ 2025-07-31 06:38:26,965 - retrieval - INFO - Embedding model loaded successfully
14
+ 2025-07-31 06:38:28,444 - retrieval - INFO - Chunks loaded successfully
15
+ 2025-07-31 06:38:28,532 - retrieval - INFO - Embeddings loaded successfully
16
+ 2025-07-31 06:38:28,533 - retrieval - INFO - Loaded existing emergency index
17
+ 2025-07-31 06:38:28,534 - retrieval - INFO - Loaded existing treatment index
18
+ 2025-07-31 06:38:28,534 - retrieval - INFO - Retrieval system initialized successfully
19
+ ✅ Retrieval system initialized successfully
20
+ 3. Initializing User Prompt Processor...
21
+ 2025-07-31 06:38:28,534 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
22
+ 2025-07-31 06:38:28,534 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: NeuML/pubmedbert-base-embeddings
23
+ 2025-07-31 06:38:30,716 - user_prompt - INFO - UserPromptProcessor initialized
24
+ ✅ User prompt processor initialized successfully
25
+
26
+ 🎉 All components initialized successfully!
27
+
28
+ 🚀 Starting Comprehensive Pipeline Test
29
+ Total test cases: 6
30
+ Test started at: 2025-07-31 06:38:22
31
+ ================================================================================
32
+
33
+ 🔍 test_001: Classic acute myocardial infarction query
34
+ Query: 'how to treat acute MI?'
35
+ ------------------------------------------------------------
36
+ Step 1: Extracting medical condition and keywords...
37
+ 2025-07-31 06:38:30,716 - llm_clients - INFO - Calling Medical LLM with query: how to treat acute MI?
38
+ 2025-07-31 06:39:12,449 - llm_clients - INFO - Raw LLM Response: The most representative condition: Acute Myocardial Infarction (AMI, or Heart Attack)
39
+
40
+ For treatment guidance: Acute myocardial infarction is managed by cardiologists and emergency medical teams, not medical assistants. However, for informational purposes, primary treatments include:
41
+ 1. Reperfusion therapy: This may involve fibrinolysis (clot-busting medications) or percutaneous coronary intervention (PCI, such as angioplasty and stenting).
42
+ 2. Antiplatelet therapy
43
+ 2025-07-31 06:39:12,450 - llm_clients - INFO - Query Latency: 41.7327 seconds
44
+ 2025-07-31 06:39:12,450 - llm_clients - INFO - Extracted Condition: acute myocardial infarction
45
+ Condition: acute myocardial infarction
46
+ Emergency keywords: MI|chest pain|cardiac arrest
47
+ Treatment keywords: aspirin|nitroglycerin|thrombolytic|PCI
48
+ Source: predefined_mapping
49
+ Duration: 41.734s
50
+
51
+ Step 2: User confirmation process...
52
+ Confirmation type: confirmation_needed
53
+
54
+ Step 3: Executing retrieval...
55
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.46it/s]
56
+ 2025-07-31 06:39:13,227 - retrieval - INFO - Search results: Emergency=5, Treatment=5
57
+ 2025-07-31 06:39:13,228 - retrieval - INFO - Deduplication: Processing 10 results using text matching
58
+ 2025-07-31 06:39:13,228 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
59
+ Search query: 'MI|chest pain|cardiac arrest aspirin|nitroglycerin|thrombolytic|PCI'
60
+ Total results: 9
61
+ Emergency results: 4
62
+ Treatment results: 5
63
+ Duration: 0.778s
64
+
65
+ Top 3 results:
66
+ 1. Type: treatment, Distance: 0.6740
67
+ Text preview: ong term management abbreviations : ace : angiotensin converting enzyme ; arb : angiotensin receptor...
68
+ 2. Type: treatment, Distance: 0.6792
69
+ Text preview: on ; pci : percutaneous coronary intervention ; po : per os ; stemi : st elevation myocardial infarc...
70
+ 3. Type: treatment, Distance: 0.6904
71
+ Text preview: receptor blocker ; mi : myocardial infarction # do ' s - a pre - hospital ecg is recommended. if ste...
72
+
73
+ ✅ Test test_001 completed successfully (42.511s)
74
+
75
+ 🔍 test_002: Symptoms-based query requiring LLM analysis
76
+ Query: 'patient with severe chest pain and shortness of breath'
77
+ ------------------------------------------------------------
78
+ Step 1: Extracting medical condition and keywords...
79
+ 2025-07-31 06:39:13,228 - llm_clients - INFO - Calling Medical LLM with query: patient with severe chest pain and shortness of breath
80
+ 2025-07-31 06:39:31,525 - llm_clients - INFO - Raw LLM Response: Acute Coronary Syndrome (specifically, possible ST-Elevation Myocardial Infarction - STEMI, given severe chest pain, or non-STEMI/NST-Elevation Acute Coronary Syndrome if ST segments not elevated, based on ECG; shortness of breath indicates potential cardiac ischemia complication or concurrent pulmonary issue like cardiogenic pulmonary edema)
81
+
82
+ Note: This response is for informational purposes only and should not replace immediate medical evaluation and diagnosis by a licensed physician. The patient needs
83
+ 2025-07-31 06:39:31,525 - llm_clients - INFO - Query Latency: 18.2971 seconds
84
+ 2025-07-31 06:39:31,525 - llm_clients - INFO - Extracted Condition: Acute Coronary Syndrome (specifically, possible ST-Elevation Myocardial Infarction - STEMI, given severe chest pain, or non-STEMI/NST-Elevation Acute Coronary Syndrome if ST segments not elevated, based on ECG; shortness of breath indicates potential cardiac ischemia complication or concurrent pulmonary issue like cardiogenic pulmonary edema)
85
+ 2025-07-31 06:39:31,525 - user_prompt - INFO - Starting semantic search fallback for query: 'patient with severe chest pain and shortness of breath'
86
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 7.70it/s]
87
+ 2025-07-31 06:39:32,392 - retrieval - INFO - Sliding window search: Found 5 results
88
+ 2025-07-31 06:39:32,402 - user_prompt - INFO - Semantic search returned 5 results
89
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.86it/s]
90
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.53it/s]
91
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.22it/s]
92
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 57.51it/s]
93
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.23it/s]
94
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.05it/s]
95
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.09it/s]
96
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 60.88it/s]
97
+ 2025-07-31 06:39:32,729 - user_prompt - INFO - Inferred condition: None
98
+ 2025-07-31 06:39:32,729 - user_prompt - WARNING - Condition validation failed for: None
99
+ 2025-07-31 06:39:32,729 - user_prompt - INFO - No suitable condition found in semantic search
100
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.77it/s]
101
+ 2025-07-31 06:39:33,251 - retrieval - INFO - Sliding window search: Found 5 results
102
+ Condition: generic medical query
103
+ Emergency keywords: medical|emergency
104
+ Treatment keywords: treatment|management
105
+ Source: generic_search
106
+ Duration: 20.033s
107
+
108
+ Step 2: User confirmation process...
109
+ Confirmation type: confirmation_needed
110
+
111
+ Step 3: Executing retrieval...
112
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.28it/s]
113
+ 2025-07-31 06:39:33,404 - retrieval - INFO - Search results: Emergency=5, Treatment=5
114
+ 2025-07-31 06:39:33,404 - retrieval - INFO - Deduplication: Processing 10 results using text matching
115
+ 2025-07-31 06:39:33,404 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
116
+ Search query: 'medical|emergency treatment|management'
117
+ Total results: 9
118
+ Emergency results: 5
119
+ Treatment results: 4
120
+ Duration: 0.143s
121
+
122
+ Top 3 results:
123
+ 1. Type: treatment, Distance: 0.7708
124
+ Text preview: and nurse practitioners who may or may not be formally trained in emergency medicine. they offer pri...
125
+ 2. Type: emergency, Distance: 0.8056
126
+ Text preview: organization of emergency medical assistance emergency medical assistance is the first aid that is g...
127
+ 3. Type: emergency, Distance: 0.8321
128
+ Text preview: ion to the emergency room ; - urgent situation that requires advanced medical care before transporta...
129
+
130
+ ✅ Test test_002 completed successfully (20.176s)
131
+
132
+ 🔍 test_003: Neurological emergency query
133
+ Query: 'sudden neurological symptoms suggesting stroke'
134
+ ------------------------------------------------------------
135
+ Step 1: Extracting medical condition and keywords...
136
+ 2025-07-31 06:39:33,404 - llm_clients - INFO - Calling Medical LLM with query: sudden neurological symptoms suggesting stroke
137
+ 2025-07-31 06:39:49,400 - llm_clients - INFO - Raw LLM Response: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke
138
+
139
+ (As a medical assistant, I'm limited to providing condition labels, not advice. In this case, the description given—sudden neurological symptoms suggestive of stroke—points to an acute ischemic stroke, also known as cerebrovascular accident (CVA). This diagnosis implies a blockage of blood flow to the brain, resulting in sudden neurological deficits.)
140
+
141
+ **Please consult a qualified healthcare professional for evaluation and management.
142
+ 2025-07-31 06:39:49,403 - llm_clients - INFO - Query Latency: 15.9960 seconds
143
+ 2025-07-31 06:39:49,404 - llm_clients - INFO - Extracted Condition: Cerebrovascular Accident (CVA), or Acute Ischemic Stroke
144
+ 2025-07-31 06:39:49,405 - user_prompt - INFO - Starting semantic search fallback for query: 'sudden neurological symptoms suggesting stroke'
145
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 8.53it/s]
146
+ 2025-07-31 06:39:50,205 - retrieval - INFO - Sliding window search: Found 5 results
147
+ 2025-07-31 06:39:50,214 - user_prompt - INFO - Semantic search returned 5 results
148
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.55it/s]
149
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.19it/s]
150
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.05it/s]
151
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.50it/s]
152
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.67it/s]
153
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.14it/s]
154
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.27it/s]
155
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.62it/s]
156
+ 2025-07-31 06:39:50,417 - user_prompt - INFO - Inferred condition: None
157
+ 2025-07-31 06:39:50,418 - user_prompt - WARNING - Condition validation failed for: None
158
+ 2025-07-31 06:39:50,418 - user_prompt - INFO - No suitable condition found in semantic search
159
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.16it/s]
160
+ 2025-07-31 06:39:50,938 - retrieval - INFO - Sliding window search: Found 5 results
161
+ Condition: generic medical query
162
+ Emergency keywords: medical|emergency
163
+ Treatment keywords: treatment|management
164
+ Source: generic_search
165
+ Duration: 17.544s
166
+
167
+ Step 2: User confirmation process...
168
+ Confirmation type: confirmation_needed
169
+
170
+ Step 3: Executing retrieval...
171
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.02it/s]
172
+ 2025-07-31 06:39:50,972 - retrieval - INFO - Search results: Emergency=5, Treatment=5
173
+ 2025-07-31 06:39:50,972 - retrieval - INFO - Deduplication: Processing 10 results using text matching
174
+ 2025-07-31 06:39:50,972 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
175
+ Search query: 'medical|emergency treatment|management'
176
+ Total results: 9
177
+ Emergency results: 5
178
+ Treatment results: 4
179
+ Duration: 0.025s
180
+
181
+ Top 3 results:
182
+ 1. Type: treatment, Distance: 0.7708
183
+ Text preview: and nurse practitioners who may or may not be formally trained in emergency medicine. they offer pri...
184
+ 2. Type: emergency, Distance: 0.8056
185
+ Text preview: organization of emergency medical assistance emergency medical assistance is the first aid that is g...
186
+ 3. Type: emergency, Distance: 0.8321
187
+ Text preview: ion to the emergency room ; - urgent situation that requires advanced medical care before transporta...
188
+
189
+ ✅ Test test_003 completed successfully (17.569s)
190
+
191
+ 🔍 test_004: Protocol-specific stroke query
192
+ Query: 'acute stroke management protocol'
193
+ ------------------------------------------------------------
194
+ Step 1: Extracting medical condition and keywords...
195
+ 2025-07-31 06:39:50,973 - user_prompt - INFO - Matched predefined condition: acute stroke
196
+ Condition: acute stroke
197
+ Emergency keywords: stroke|neurological deficit|sudden weakness
198
+ Treatment keywords: tPA|thrombolysis|stroke unit care
199
+ Source: predefined_mapping
200
+ Duration: 0.000s
201
+
202
+ Step 2: User confirmation process...
203
+ Confirmation type: confirmation_needed
204
+
205
+ Step 3: Executing retrieval...
206
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.92it/s]
207
+ 2025-07-31 06:39:51,110 - retrieval - INFO - Search results: Emergency=5, Treatment=5
208
+ 2025-07-31 06:39:51,110 - retrieval - INFO - Deduplication: Processing 10 results using text matching
209
+ 2025-07-31 06:39:51,110 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
210
+ Search query: 'stroke|neurological deficit|sudden weakness tPA|thrombolysis|stroke unit care'
211
+ Total results: 9
212
+ Emergency results: 5
213
+ Treatment results: 4
214
+ Duration: 0.137s
215
+
216
+ Top 3 results:
217
+ 1. Type: treatment, Distance: 0.7389
218
+ Text preview: hree hours of the onset of stroke. early treatment ( within 90 minutes ) may be more likely to resul...
219
+ 2. Type: treatment, Distance: 0.7401
220
+ Text preview: hree hours of the onset of stroke. early treatment ( within 90 minutes ) may be more likely to resul...
221
+ 3. Type: emergency, Distance: 0.7685
222
+ Text preview: mproved outcomes for a broad spectrum of carefully selected clients who can be treated within three ...
223
+
224
+ ✅ Test test_004 completed successfully (0.137s)
225
+
226
+ 🔍 test_005: General symptom requiring LLM analysis
227
+ Query: 'patient presenting with acute abdominal pain'
228
+ ------------------------------------------------------------
229
+ Step 1: Extracting medical condition and keywords...
230
+ 2025-07-31 06:39:51,110 - llm_clients - INFO - Calling Medical LLM with query: patient presenting with acute abdominal pain
231
+ 2025-07-31 06:40:00,096 - llm_clients - INFO - Raw LLM Response: Acute Appendicitis
232
+
233
+ (As a medical assistant, I identify the most representative condition here as acute appendicitis, given the patient's symptom of acute abdominal pain, particularly if localized in the right lower quadrant and accompanied by other typical signs like nausea, vomiting, fever, or guarding. However, this is not a definitive diagnosis and should be confirmed by a physician through clinical evaluation, imaging, or surgical findings.)
234
+ 2025-07-31 06:40:00,096 - llm_clients - INFO - Query Latency: 8.9862 seconds
235
+ 2025-07-31 06:40:00,097 - llm_clients - INFO - Extracted Condition: Acute Appendicitis
236
+ 2025-07-31 06:40:00,097 - user_prompt - INFO - Starting semantic search fallback for query: 'patient presenting with acute abdominal pain'
237
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.49it/s]
238
+ 2025-07-31 06:40:00,664 - retrieval - INFO - Sliding window search: Found 5 results
239
+ 2025-07-31 06:40:00,673 - user_prompt - INFO - Semantic search returned 5 results
240
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15.57it/s]
241
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 50.55it/s]
242
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 47.08it/s]
243
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.74it/s]
244
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.91it/s]
245
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 61.25it/s]
246
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 58.38it/s]
247
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 64.09it/s]
248
+ 2025-07-31 06:40:00,876 - user_prompt - INFO - Inferred condition: None
249
+ 2025-07-31 06:40:00,876 - user_prompt - WARNING - Condition validation failed for: None
250
+ 2025-07-31 06:40:00,876 - user_prompt - INFO - No suitable condition found in semantic search
251
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.32it/s]
252
+ 2025-07-31 06:40:01,399 - retrieval - INFO - Sliding window search: Found 5 results
253
+ Condition: generic medical query
254
+ Emergency keywords: medical|emergency
255
+ Treatment keywords: treatment|management
256
+ Source: generic_search
257
+ Duration: 10.298s
258
+
259
+ Step 2: User confirmation process...
260
+ Confirmation type: confirmation_needed
261
+
262
+ Step 3: Executing retrieval...
263
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.41it/s]
264
+ 2025-07-31 06:40:01,432 - retrieval - INFO - Search results: Emergency=5, Treatment=5
265
+ 2025-07-31 06:40:01,432 - retrieval - INFO - Deduplication: Processing 10 results using text matching
266
+ 2025-07-31 06:40:01,432 - retrieval - INFO - Deduplication summary: 10 → 9 results (removed 1)
267
+ Search query: 'medical|emergency treatment|management'
268
+ Total results: 9
269
+ Emergency results: 5
270
+ Treatment results: 4
271
+ Duration: 0.025s
272
+
273
+ Top 3 results:
274
+ 1. Type: treatment, Distance: 0.7708
275
+ Text preview: and nurse practitioners who may or may not be formally trained in emergency medicine. they offer pri...
276
+ 2. Type: emergency, Distance: 0.8056
277
+ Text preview: organization of emergency medical assistance emergency medical assistance is the first aid that is g...
278
+ 3. Type: emergency, Distance: 0.8321
279
+ Text preview: ion to the emergency room ; - urgent situation that requires advanced medical care before transporta...
280
+
281
+ ✅ Test test_005 completed successfully (10.322s)
282
+
283
+ 🔍 test_006: Specific condition with treatment focus
284
+ Query: 'pulmonary embolism treatment guidelines'
285
+ ------------------------------------------------------------
286
+ Step 1: Extracting medical condition and keywords...
287
+ 2025-07-31 06:40:01,432 - user_prompt - INFO - Matched predefined condition: pulmonary embolism
288
+ Condition: pulmonary embolism
289
+ Emergency keywords: chest pain|shortness of breath|sudden dyspnea
290
+ Treatment keywords: anticoagulation|heparin|embolectomy
291
+ Source: predefined_mapping
292
+ Duration: 0.000s
293
+
294
+ Step 2: User confirmation process...
295
+ Confirmation type: confirmation_needed
296
+
297
+ Step 3: Executing retrieval...
298
+ Batches: 100%|███████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.27it/s]
299
+ 2025-07-31 06:40:01,562 - retrieval - INFO - Search results: Emergency=5, Treatment=5
300
+ 2025-07-31 06:40:01,562 - retrieval - INFO - Deduplication: Processing 10 results using text matching
301
+ 2025-07-31 06:40:01,562 - retrieval - INFO - Deduplication summary: 10 → 8 results (removed 2)
302
+ Search query: 'chest pain|shortness of breath|sudden dyspnea anticoagulation|heparin|embolectomy'
303
+ Total results: 8
304
+ Emergency results: 5
305
+ Treatment results: 3
306
+ Duration: 0.130s
307
+
308
+ Top 3 results:
309
+ 1. Type: emergency, Distance: 0.8949
310
+ Text preview: algesics ( e. g. morphine, pethidine ) facilities for defibrillation ( df ) aspirin / anticoagulant ...
311
+ 2. Type: treatment, Distance: 0.9196
312
+ Text preview: y proximal deep vein thrombosis leading to acute pulmonary embolism # # common causes of peripheral ...
313
+ 3. Type: emergency, Distance: 0.9216
314
+ Text preview: ed or discolored skin in the affected leg - visible surface veins dvt usually involves the deep vein...
315
+
316
+ ✅ Test test_006 completed successfully (0.130s)
317
+
318
+ ================================================================================
319
+ 📊 COMPREHENSIVE TEST REPORT
320
+ ================================================================================
321
+ 🕐 Execution Summary:
322
+ Start time: 2025-07-31 06:38:22
323
+ End time: 2025-07-31 06:40:01
324
+ Total duration: 98.954s
325
+ Average per test: 16.492s
326
+
327
+ 📈 Test Results:
328
+ Total tests: 6
329
+ Successful: 6 ✅
330
+ Failed: 0 ❌
331
+ Success rate: 100.0%
332
+
333
+ ✅ Successful Tests Analysis:
334
+ Condition extraction sources:
335
+ - predefined_mapping: 3 tests
336
+ - generic_search: 3 tests
337
+ Performance metrics:
338
+ - Avg condition extraction: 14.935s
339
+ - Avg retrieval time: 0.206s
340
+
341
+ 📋 test_001: Classic acute myocardial infarction query
342
+ Query: 'how to treat acute MI?'
343
+ Condition: acute myocardial infarction
344
+ Source: predefined_mapping
345
+ Results: 9 total (4 emergency, 5 treatment)
346
+ Duration: 42.511s
347
+
348
+ 📋 test_002: Symptoms-based query requiring LLM analysis
349
+ Query: 'patient with severe chest pain and shortness of breath'
350
+ Condition: generic medical query
351
+ Source: generic_search
352
+ Results: 9 total (5 emergency, 4 treatment)
353
+ Duration: 20.176s
354
+
355
+ 📋 test_003: Neurological emergency query
356
+ Query: 'sudden neurological symptoms suggesting stroke'
357
+ Condition: generic medical query
358
+ Source: generic_search
359
+ Results: 9 total (5 emergency, 4 treatment)
360
+ Duration: 17.569s
361
+
362
+ 📋 test_004: Protocol-specific stroke query
363
+ Query: 'acute stroke management protocol'
364
+ Condition: acute stroke
365
+ Source: predefined_mapping
366
+ Results: 9 total (5 emergency, 4 treatment)
367
+ Duration: 0.137s
368
+
369
+ 📋 test_005: General symptom requiring LLM analysis
370
+ Query: 'patient presenting with acute abdominal pain'
371
+ Condition: generic medical query
372
+ Source: generic_search
373
+ Results: 9 total (5 emergency, 4 treatment)
374
+ Duration: 10.322s
375
+
376
+ 📋 test_006: Specific condition with treatment focus
377
+ Query: 'pulmonary embolism treatment guidelines'
378
+ Condition: pulmonary embolism
379
+ Source: predefined_mapping
380
+ Results: 8 total (5 emergency, 3 treatment)
381
+ Duration: 0.130s
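
The pipeline test above repeats the same three steps for every query. The fragment below is a rough sketch of that loop; `extract_condition_keywords` and `search` are assumed method names used only for illustration, not the module's documented API.

```python
# Rough sketch of the per-query pipeline logged above (Steps 1-3).
# Object and method names are illustrative assumptions, not the module API.
def run_pipeline(query, prompt_processor, retrieval_system, top_k=5):
    # Step 1: extract condition plus emergency/treatment keyword groups
    extraction = prompt_processor.extract_condition_keywords(query)

    # Step 2: user confirmation gate before any retrieval is executed
    if extraction.get("condition") is None:
        return {"status": "rejected", "query": query}

    # Step 3: combined search over both indices, then deduplication
    # e.g. 'MI|chest pain|cardiac arrest aspirin|nitroglycerin|thrombolytic|PCI'
    search_query = f"{extraction['emergency_keywords']} {extraction['treatment_keywords']}"
    results = retrieval_system.search(search_query, top_k=top_k)
    return {"extraction": extraction, "results": results}
```

This mirrors the logged search queries, which simply concatenate the emergency and treatment keyword groups with a space.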
tests/test_chunk_quality_analysis.py ADDED
@@ -0,0 +1,333 @@
1
+ """
2
+ Chunk Quality Analysis Tests
3
+
4
+ This module analyzes chunk quality and identifies issues with chunk length differences
5
+ between emergency and treatment data processing methods.
6
+
7
+ Author: OnCall.ai Team
8
+ Date: 2025-07-28
9
+ """
10
+
11
+ import sys
12
+ import json
13
+ import numpy as np
14
+ from pathlib import Path
15
+ from typing import List, Dict, Tuple
16
+ import logging
17
+
18
+ # Setup logging
19
+ logging.basicConfig(
20
+ level=logging.INFO,
21
+ format='%(levelname)s:%(name)s:%(message)s'
22
+ )
23
+ logger = logging.getLogger(__name__)
24
+
25
+ # Add src to python path
26
+ current_dir = Path(__file__).parent.resolve()
27
+ project_root = current_dir.parent
28
+ sys.path.append(str(project_root / "src"))
29
+
30
+ from data_processing import DataProcessor #type: ignore
31
+
32
+ class TestChunkQualityAnalysis:
33
+
34
+ def setup_class(self):
35
+ """Initialize test environment"""
36
+ print("\n=== Phase 1: Setting up Chunk Quality Analysis ===")
37
+ self.base_dir = Path(__file__).parent.parent.resolve()
38
+ self.models_dir = self.base_dir / "models"
39
+ self.embeddings_dir = self.models_dir / "embeddings"
40
+
41
+ print(f"• Base directory: {self.base_dir}")
42
+ print(f"• Models directory: {self.models_dir}")
43
+
44
+ # Initialize processor
45
+ self.processor = DataProcessor(base_dir=str(self.base_dir))
46
+ print("• DataProcessor initialized")
47
+
48
+ def test_chunk_length_analysis(self):
49
+ """Detailed analysis of chunk length distribution"""
50
+ print("\n=== Phase 2: Chunk Length Distribution Analysis ===")
51
+
52
+ try:
53
+ # Load chunk data
54
+ print("• Loading chunk data...")
55
+ with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
56
+ emergency_chunks = json.load(f)
57
+ with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
58
+ treatment_chunks = json.load(f)
59
+
60
+ # Analyze emergency chunks
61
+ em_lengths = [len(chunk['text']) for chunk in emergency_chunks]
62
+ em_token_counts = [chunk.get('token_count', 0) for chunk in emergency_chunks]
63
+
64
+ print(f"\n📊 Emergency Chunks Analysis:")
65
+ print(f"• Total chunks: {len(em_lengths):,}")
66
+ print(f"• Min length: {min(em_lengths)} chars")
67
+ print(f"• Max length: {max(em_lengths)} chars")
68
+ print(f"• Average length: {sum(em_lengths)/len(em_lengths):.2f} chars")
69
+ print(f"• Median length: {sorted(em_lengths)[len(em_lengths)//2]} chars")
70
+
71
+ if any(em_token_counts):
72
+ avg_tokens = sum(em_token_counts)/len(em_token_counts)
73
+ print(f"• Average tokens: {avg_tokens:.2f}")
74
+ print(f"• Chars per token ratio: {(sum(em_lengths)/len(em_lengths)) / avg_tokens:.2f}")
75
+
76
+ # Analyze treatment chunks
77
+ tr_lengths = [len(chunk['text']) for chunk in treatment_chunks]
78
+
79
+ print(f"\n📊 Treatment Chunks Analysis:")
80
+ print(f"• Total chunks: {len(tr_lengths):,}")
81
+ print(f"• Min length: {min(tr_lengths)} chars")
82
+ print(f"• Max length: {max(tr_lengths)} chars")
83
+ print(f"• Average length: {sum(tr_lengths)/len(tr_lengths):.2f} chars")
84
+ print(f"• Median length: {sorted(tr_lengths)[len(tr_lengths)//2]} chars")
85
+
86
+ # Length distribution comparison
87
+ em_avg = sum(em_lengths)/len(em_lengths)
88
+ tr_avg = sum(tr_lengths)/len(tr_lengths)
89
+ ratio = em_avg / tr_avg
90
+
91
+ print(f"\n🔍 Length Distribution Comparison:")
92
+ print(f"• Emergency average: {em_avg:.0f} chars")
93
+ print(f"• Treatment average: {tr_avg:.0f} chars")
94
+ print(f"• Ratio (Emergency/Treatment): {ratio:.1f}x")
95
+
96
+ # Length distribution buckets
97
+ print(f"\n📈 Length Distribution Buckets:")
98
+ buckets = [0, 100, 250, 500, 1000, 2000, 5000]
99
+
100
+ for i in range(len(buckets)-1):
101
+ em_count = sum(1 for l in em_lengths if buckets[i] <= l < buckets[i+1])
102
+ tr_count = sum(1 for l in tr_lengths if buckets[i] <= l < buckets[i+1])
103
+ print(f"• {buckets[i]}-{buckets[i+1]} chars: Emergency={em_count}, Treatment={tr_count}")
104
+
105
+ # Flag potential issues
106
+ if ratio > 5.0:
107
+ print(f"\n⚠️ WARNING: Emergency chunks are {ratio:.1f}x longer than treatment chunks!")
108
+ print(" This suggests different chunking strategies are being used.")
109
+
110
+ print("✅ Chunk length analysis completed")
111
+
112
+ except Exception as e:
113
+ print(f"❌ Error in chunk length analysis: {str(e)}")
114
+ raise
115
+
116
+ def test_chunking_method_comparison(self):
117
+ """Compare the two chunking methods on the same data"""
118
+ print("\n=== Phase 3: Chunking Method Comparison ===")
119
+
120
+ try:
121
+ # Load data
122
+ print("• Loading dataset for comparison...")
123
+ self.processor.load_filtered_data()
124
+
125
+ # Test on multiple samples for better analysis
126
+ sample_size = 5
127
+ samples = self.processor.treatment_data.head(sample_size)
128
+
129
+ method1_results = [] # keyword_centered_chunks
130
+ method2_results = [] # dual_keyword_chunks
131
+
132
+ print(f"• Testing {sample_size} samples with both methods...")
133
+
134
+ for idx, row in samples.iterrows():
135
+ if not row.get('clean_text') or not row.get('treatment_matched'):
136
+ continue
137
+
138
+ text_length = len(row['clean_text'])
139
+ emergency_kw = row.get('matched', '')
140
+ treatment_kw = row['treatment_matched']
141
+
142
+ # Method 1: keyword_centered_chunks (Emergency method)
143
+ chunks1 = self.processor.create_keyword_centered_chunks(
144
+ text=row['clean_text'],
145
+ matched_keywords=emergency_kw,
146
+ chunk_size=256,
147
+ doc_id=f"test_{idx}"
148
+ )
149
+
150
+ # Method 2: dual_keyword_chunks (Treatment method)
151
+ chunks2 = self.processor.create_dual_keyword_chunks(
152
+ text=row['clean_text'],
153
+ emergency_keywords=emergency_kw,
154
+ treatment_keywords=treatment_kw,
155
+ chunk_size=256,
156
+ doc_id=f"test_{idx}"
157
+ )
158
+
159
+ # Collect results
160
+ if chunks1:
161
+ avg_len1 = sum(len(c['text']) for c in chunks1) / len(chunks1)
162
+ method1_results.append({
163
+ 'doc_id': idx,
164
+ 'chunks_count': len(chunks1),
165
+ 'avg_length': avg_len1,
166
+ 'text_length': text_length
167
+ })
168
+
169
+ if chunks2:
170
+ avg_len2 = sum(len(c['text']) for c in chunks2) / len(chunks2)
171
+ method2_results.append({
172
+ 'doc_id': idx,
173
+ 'chunks_count': len(chunks2),
174
+ 'avg_length': avg_len2,
175
+ 'text_length': text_length
176
+ })
177
+
178
+ # Analysis results
179
+ print(f"\n📊 Method Comparison Results:")
180
+
181
+ if method1_results:
182
+ avg_chunks1 = sum(r['chunks_count'] for r in method1_results) / len(method1_results)
183
+ avg_len1 = sum(r['avg_length'] for r in method1_results) / len(method1_results)
184
+ print(f"\n🔹 Keyword-Centered Method (Emergency):")
185
+ print(f"• Average chunks per document: {avg_chunks1:.1f}")
186
+ print(f"• Average chunk length: {avg_len1:.0f} chars")
187
+
188
+ if method2_results:
189
+ avg_chunks2 = sum(r['chunks_count'] for r in method2_results) / len(method2_results)
190
+ avg_len2 = sum(r['avg_length'] for r in method2_results) / len(method2_results)
191
+ print(f"\n🔹 Dual-Keyword Method (Treatment):")
192
+ print(f"• Average chunks per document: {avg_chunks2:.1f}")
193
+ print(f"• Average chunk length: {avg_len2:.0f} chars")
194
+
195
+ if method1_results and method2_results:
196
+ ratio = avg_len1 / avg_len2
197
+ print(f"\n🔍 Length Ratio: {ratio:.1f}x (Method1 / Method2)")
198
+
199
+ print("✅ Chunking method comparison completed")
200
+
201
+ except Exception as e:
202
+ print(f"❌ Error in method comparison: {str(e)}")
203
+ raise
204
+
205
+ def test_token_vs_character_analysis(self):
206
+ """Analyze token vs character differences in chunking"""
207
+ print("\n=== Phase 4: Token vs Character Analysis ===")
208
+
209
+ try:
210
+ # Load model for tokenization
211
+ print("• Loading embedding model for tokenization...")
212
+ self.processor.load_embedding_model()
213
+
214
+ # Test sample texts
215
+ test_texts = [
216
+ "Patient presents with acute chest pain and shortness of breath.",
217
+ "Emergency treatment for myocardial infarction includes immediate medication.",
218
+ "The patient's vital signs show tachycardia and hypotension requiring intervention."
219
+ ]
220
+
221
+ print(f"\n📊 Token vs Character Analysis:")
222
+
223
+ total_chars = 0
224
+ total_tokens = 0
225
+
226
+ for i, text in enumerate(test_texts, 1):
227
+ char_count = len(text)
228
+ token_count = len(self.processor.tokenizer.tokenize(text))
229
+ ratio = char_count / token_count if token_count > 0 else 0
230
+
231
+ print(f"\nSample {i}:")
232
+ print(f"• Text: {text[:50]}...")
233
+ print(f"• Characters: {char_count}")
234
+ print(f"• Tokens: {token_count}")
235
+ print(f"• Chars/Token ratio: {ratio:.2f}")
236
+
237
+ total_chars += char_count
238
+ total_tokens += token_count
239
+
240
+ overall_ratio = total_chars / total_tokens
241
+ print(f"\n🔍 Overall Character/Token Ratio: {overall_ratio:.2f}")
242
+
243
+ # Estimate chunk sizes
244
+ target_tokens = 256
245
+ estimated_chars = target_tokens * overall_ratio
246
+
247
+ print(f"\n📏 Chunk Size Estimates:")
248
+ print(f"• Target tokens: {target_tokens}")
249
+ print(f"• Estimated characters: {estimated_chars:.0f}")
250
+ print(f"• Current emergency avg: 1842 chars ({1842/overall_ratio:.0f} estimated tokens)")
251
+ print(f"• Current treatment avg: 250 chars ({250/overall_ratio:.0f} estimated tokens)")
252
+
253
+ # Recommendations
254
+ print(f"\n💡 Recommendations:")
255
+ if 1842/overall_ratio > 512:
256
+ print("⚠️ Emergency chunks may exceed model's 512 token limit!")
257
+ if 250/overall_ratio < 64:
258
+ print("⚠️ Treatment chunks may be too short for meaningful context!")
259
+
260
+ print("✅ Token vs character analysis completed")
261
+
262
+ except Exception as e:
263
+ print(f"❌ Error in token analysis: {str(e)}")
264
+ raise
265
+
266
+ def test_generate_recommendations(self):
267
+ """Generate recommendations based on analysis"""
268
+ print("\n=== Phase 5: Generating Recommendations ===")
269
+
270
+ recommendations = []
271
+
272
+ # Based on the known chunk length difference
273
+ recommendations.append({
274
+ 'issue': 'Inconsistent chunk lengths',
275
+ 'description': 'Emergency chunks (1842 chars) are 7x longer than treatment chunks (250 chars)',
276
+ 'recommendation': 'Standardize both methods to use token-based chunking with consistent parameters',
277
+ 'priority': 'HIGH'
278
+ })
279
+
280
+ recommendations.append({
281
+ 'issue': 'Different chunking strategies',
282
+ 'description': 'Emergency uses keyword-centered (token-based), Treatment uses dual-keyword (character-based)',
283
+ 'recommendation': 'Update dual_keyword_chunks to use tokenizer for consistent token-based chunking',
284
+ 'priority': 'HIGH'
285
+ })
286
+
287
+ recommendations.append({
288
+ 'issue': 'Potential token limit overflow',
289
+ 'description': 'Large chunks may exceed PubMedBERT 512 token limit',
290
+ 'recommendation': 'Implement strict token-based chunking with overlap to prevent overflow',
291
+ 'priority': 'MEDIUM'
292
+ })
293
+
294
+ print(f"\n📋 Analysis Recommendations:")
295
+ for i, rec in enumerate(recommendations, 1):
296
+ print(f"\n{i}. {rec['issue']} [{rec['priority']}]")
297
+ print(f" Problem: {rec['description']}")
298
+ print(f" Solution: {rec['recommendation']}")
299
+
300
+ print("\n✅ Recommendations generated")
301
+ return recommendations
302
+
303
+ def main():
304
+ """Run all chunk quality analysis tests"""
305
+ print("\n" + "="*60)
306
+ print("CHUNK QUALITY ANALYSIS TEST SUITE")
307
+ print("="*60)
308
+
309
+ test = TestChunkQualityAnalysis()
310
+ test.setup_class()
311
+
312
+ try:
313
+ test.test_chunk_length_analysis()
314
+ test.test_chunking_method_comparison()
315
+ test.test_token_vs_character_analysis()
316
+ recommendations = test.test_generate_recommendations()
317
+
318
+ print("\n" + "="*60)
319
+ print("🎉 ALL CHUNK QUALITY TESTS COMPLETED SUCCESSFULLY!")
320
+ print("="*60)
321
+ print(f"\nKey Finding: Chunk length inconsistency detected!")
322
+ print(f"Emergency: ~1842 chars, Treatment: ~250 chars (7x difference)")
323
+ print(f"Recommendation: Standardize to token-based chunking")
324
+ print("="*60)
325
+
326
+ except Exception as e:
327
+ print("\n" + "="*60)
328
+ print("❌ CHUNK QUALITY TESTS FAILED!")
329
+ print(f"Error: {str(e)}")
330
+ print("="*60)
331
+
332
+ if __name__ == "__main__":
333
+ main()
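
The HIGH-priority recommendation above is to standardize both chunking paths on token counts. Below is a hedged sketch of what that could look like with the embedding model's tokenizer; the function name, overlap value, and loading via transformers are assumptions for illustration, not the DataProcessor implementation.

```python
# Sketch only: token-based chunking with overlap, assuming the transformers
# tokenizer for the embedding model named in the logs. Not the project's code.
from transformers import AutoTokenizer

def chunk_by_tokens(text, chunk_size=256, overlap=32,
                    model_name="NeuML/pubmedbert-base-embeddings"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokens = tokenizer.tokenize(text)
    chunks, step = [], chunk_size - overlap
    for start in range(0, max(len(tokens), 1), step):
        window = tokens[start:start + chunk_size]
        if not window:
            break
        # Every chunk stays at or below chunk_size tokens, well under the
        # 512-token PubMedBERT limit flagged in Phase 4 of the analysis.
        chunks.append(tokenizer.convert_tokens_to_string(window))
    return chunks
```

The analysis script itself can be run directly (python tests/test_chunk_quality_analysis.py) to reproduce the numbers behind these recommendations.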
tests/test_data_processing.py CHANGED
@@ -12,7 +12,7 @@ import pandas as pd
12
  # Add src to path
13
  sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
14
 
15
- from data_processing import DataProcessor
16
  import logging
17
 
18
  # Setup logging
@@ -80,7 +80,7 @@ def test_chunking():
80
  chunks = processor.create_keyword_centered_chunks(
81
  text=row['clean_text'],
82
  matched_keywords=row['matched'],
83
- chunk_size=512,
84
  doc_id=str(row.get('id', idx))
85
  )
86
  emergency_chunks.extend(chunks)
@@ -97,7 +97,7 @@ def test_chunking():
97
  text=row['clean_text'],
98
  emergency_keywords=row.get('matched', ''),
99
  treatment_keywords=row['treatment_matched'],
100
- chunk_size=512,
101
  doc_id=str(row.get('id', idx))
102
  )
103
  treatment_chunks.extend(chunks)
@@ -116,7 +116,7 @@ def test_chunking():
116
  sample_chunk = treatment_chunks[0]
117
  print(f"\nSample treatment chunk:")
118
  print(f" Primary keyword: {sample_chunk['primary_keyword']}")
119
- print(f" Emergency keywords: {sample_chunk['emergency_keywords']}")
120
  print(f" Text length: {len(sample_chunk['text'])}")
121
  print(f" Text preview: {sample_chunk['text'][:100]}...")
122
 
@@ -186,18 +186,109 @@ def test_token_chunking():
186
  print(f"❌ Token chunking test failed: {e}")
187
  return False
188
 
189
  def main():
190
  """Run all tests"""
191
  print("Starting data processing tests...\n")
192
 
193
- # Import pandas here since it's used in chunking test
194
- import pandas as pd
195
-
196
  tests = [
197
  test_data_loading,
198
  test_chunking,
199
  test_model_loading,
200
- test_token_chunking # Added new test
 
201
  ]
202
 
203
  results = []
 
12
  # Add src to path
13
  sys.path.append(str(Path(__file__).parent.parent.resolve() / "src"))
14
 
15
+ from data_processing import DataProcessor #type: ignore
16
  import logging
17
 
18
  # Setup logging
 
80
  chunks = processor.create_keyword_centered_chunks(
81
  text=row['clean_text'],
82
  matched_keywords=row['matched'],
83
+ chunk_size=256, # Updated to use 256 tokens
84
  doc_id=str(row.get('id', idx))
85
  )
86
  emergency_chunks.extend(chunks)
 
97
  text=row['clean_text'],
98
  emergency_keywords=row.get('matched', ''),
99
  treatment_keywords=row['treatment_matched'],
100
+ chunk_size=256, # Updated to use 256 tokens
101
  doc_id=str(row.get('id', idx))
102
  )
103
  treatment_chunks.extend(chunks)
 
116
  sample_chunk = treatment_chunks[0]
117
  print(f"\nSample treatment chunk:")
118
  print(f" Primary keyword: {sample_chunk['primary_keyword']}")
119
+ print(f" Emergency keywords: {sample_chunk.get('emergency_keywords', '')}")
120
  print(f" Text length: {len(sample_chunk['text'])}")
121
  print(f" Text preview: {sample_chunk['text'][:100]}...")
122
 
 
186
  print(f"❌ Token chunking test failed: {e}")
187
  return False
188
 
189
+ def test_dual_keyword_chunks():
190
+ """Test the enhanced dual keyword chunking functionality with token-based approach"""
191
+ print("\n" + "="*50)
192
+ print("TESTING DUAL KEYWORD CHUNKING")
193
+ print("="*50)
194
+
195
+ try:
196
+ processor = DataProcessor()
197
+ processor.load_embedding_model() # Need tokenizer for token count verification
198
+
199
+ # Test case 1: Both emergency and treatment keywords
200
+ print("\nTest Case 1: Both Keywords")
201
+ text = "Patient with acute MI requires immediate IV treatment. Additional chest pain symptoms require aspirin administration."
202
+ emergency_kws = "MI|chest pain"
203
+ treatment_kws = "IV|aspirin"
204
+
205
+ chunks = processor.create_dual_keyword_chunks(
206
+ text=text,
207
+ emergency_keywords=emergency_kws,
208
+ treatment_keywords=treatment_kws,
209
+ chunk_size=256
210
+ )
211
+
212
+ # Verify chunk properties
213
+ for i, chunk in enumerate(chunks):
214
+ print(f"\nChunk {i+1}:")
215
+ # Verify source type
216
+ source_type = chunk.get('source_type')
217
+ assert source_type in ['emergency', 'treatment'], f"Invalid source_type: {source_type}"
218
+ print(f"• Source type: {source_type}")
219
+
220
+ # Verify metadata for treatment chunks
221
+ if source_type == 'treatment':
222
+ contains_em = chunk.get('contains_emergency_kws', [])
223
+ contains_tr = chunk.get('contains_treatment_kws', [])
224
+ match_type = chunk.get('match_type')
225
+ print(f"• Contains Emergency: {contains_em}")
226
+ print(f"• Contains Treatment: {contains_tr}")
227
+ print(f"• Match Type: {match_type}")
228
+ assert match_type in ['both', 'emergency_only', 'treatment_only', 'none'], \
229
+ f"Invalid match_type: {match_type}"
230
+
231
+ # Verify token count
232
+ tokens = processor.tokenizer.tokenize(chunk['text'])
233
+ token_count = len(tokens)
234
+ print(f"• Token count: {token_count}")
235
+ # Allow for overlap
236
+ assert token_count <= 384, f"Chunk too large: {token_count} tokens"
237
+
238
+ # Print text preview
239
+ print(f"• Text preview: {chunk['text'][:100]}...")
240
+
241
+ # Test case 2: Emergency keywords only
242
+ print("\nTest Case 2: Emergency Only")
243
+ text = "Patient presents with severe chest pain and dyspnea."
244
+ emergency_kws = "chest pain"
245
+ treatment_kws = ""
246
+
247
+ chunks = processor.create_dual_keyword_chunks(
248
+ text=text,
249
+ emergency_keywords=emergency_kws,
250
+ treatment_keywords=treatment_kws,
251
+ chunk_size=256
252
+ )
253
+
254
+ assert len(chunks) > 0, "No chunks generated for emergency-only case"
255
+ print(f"✓ Generated {len(chunks)} chunks")
256
+
257
+ # Test case 3: Treatment keywords only
258
+ print("\nTest Case 3: Treatment Only")
259
+ text = "Administer IV fluids and monitor response."
260
+ emergency_kws = ""
261
+ treatment_kws = "IV"
262
+
263
+ chunks = processor.create_dual_keyword_chunks(
264
+ text=text,
265
+ emergency_keywords=emergency_kws,
266
+ treatment_keywords=treatment_kws,
267
+ chunk_size=256
268
+ )
269
+
270
+ assert len(chunks) > 0, "No chunks generated for treatment-only case"
271
+ print(f"✓ Generated {len(chunks)} chunks")
272
+
273
+ print("\n✅ All dual keyword chunking tests passed")
274
+ return True
275
+
276
+ except Exception as e:
277
+ print(f"\n❌ Dual keyword chunking test failed: {e}")
278
+ import traceback
279
+ traceback.print_exc()
280
+ return False
281
+
282
  def main():
283
  """Run all tests"""
284
  print("Starting data processing tests...\n")
285
 
 
 
 
286
  tests = [
287
  test_data_loading,
288
  test_chunking,
289
  test_model_loading,
290
+ test_token_chunking,
291
+ test_dual_keyword_chunks # Added new test
292
  ]
293
 
294
  results = []
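Editor's note: test_dual_keyword_chunks asserts that each treatment chunk carries a match_type of 'both', 'emergency_only', 'treatment_only', or 'none'. Purely to illustrate that classification (the real logic lives in DataProcessor.create_dual_keyword_chunks, which this sketch does not reproduce), a standalone version could be:

# Hypothetical match_type classifier mirroring the metadata the test checks.
def classify_match_type(chunk_text, emergency_keywords, treatment_keywords):
    text = chunk_text.lower()
    has_em = any(kw and kw.lower() in text for kw in emergency_keywords.split("|"))
    has_tr = any(kw and kw.lower() in text for kw in treatment_keywords.split("|"))
    if has_em and has_tr:
        return "both"
    if has_em:
        return "emergency_only"
    if has_tr:
        return "treatment_only"
    return "none"

print(classify_match_type(
    "Patient with acute MI requires immediate IV treatment.", "MI|chest pain", "IV|aspirin"))
# -> "both"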
tests/test_embedding_and_index.py CHANGED
@@ -1,29 +1,101 @@
1
  import numpy as np
2
  from annoy import AnnoyIndex
3
  import pytest
4
- from data_processing import DataProcessor
5
-
6
- @pytest.fixture(scope="module")
7
- def processor():
8
- return DataProcessor(base_dir=".")
9
-
10
- def test_embedding_dimensions(processor):
11
- # load emergency embeddings
12
- emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
13
- expected_dim = processor.embedding_dim
14
- assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
15
- assert emb.shape[1] == expected_dim, (
16
- f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
17
- )
18
-
19
- def test_annoy_search(processor):
20
- # load embeddings
21
- emb = np.load(processor.models_dir / "embeddings" / "emergency_embeddings.npy")
22
- # load Annoy index
23
- idx = AnnoyIndex(processor.embedding_dim, 'angular')
24
- idx.load(str(processor.models_dir / "indices" / "annoy" / "emergency_index.ann"))
25
- # perform a sample query
26
- query_vec = emb[0]
27
- ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
28
- assert len(ids) == 5
29
- assert all(0 <= d <= 2 for d in distances)
1
+ """
2
+ Basic embedding and index validation tests
3
+ """
4
+ # 2025-07-28
5
+ import sys
6
+ from pathlib import Path
7
+ #
8
+
9
  import numpy as np
10
  from annoy import AnnoyIndex
11
  import pytest
12
+
13
+ print("\n=== Phase 1: Initializing Test Environment ===")
14
+ # add src to python path
15
+ current_dir = Path(__file__).parent.resolve()
16
+ project_root = current_dir.parent
17
+ sys.path.append(str(project_root / "src"))
18
+
19
+ print(f"• Current directory: {current_dir}")
20
+ print(f"• Project root: {project_root}")
21
+ print(f"• Python path: {sys.path}")
22
+
23
+ from data_processing import DataProcessor #type: ignore
24
+
25
+
26
+ class TestEmbeddingAndIndex:
27
+ def setup_class(self):
28
+ """Initialize the test class"""
29
+ print("\n=== Phase 2: Setting up TestEmbeddingAndIndex ===")
30
+ self.base_dir = Path(__file__).parent.parent.resolve()
31
+ print(f"• Base directory: {self.base_dir}")
32
+ self.processor = DataProcessor(base_dir=str(self.base_dir))
33
+ print("• DataProcessor initialized")
34
+
35
+ def test_embedding_dimensions(self):
36
+ print("\n=== Phase 3: Testing Embedding Dimensions ===")
37
+ print("• Loading emergency embeddings...")
38
+ # load emergency embeddings
39
+ emb = np.load(self.processor.models_dir / "embeddings" / "emergency_embeddings.npy")
40
+ expected_dim = self.processor.embedding_dim
41
+
42
+ print(f"• Loaded embedding shape: {emb.shape}")
43
+ print(f"• Expected dimension: {expected_dim}")
44
+
45
+ assert emb.ndim == 2, f"Expected 2D array, got {emb.ndim}D"
46
+ assert emb.shape[1] == expected_dim, (
47
+ f"Expected embedding dimension {expected_dim}, got {emb.shape[1]}"
48
+ )
49
+ print("✅ Embedding dimensions test passed")
50
+
51
+ def test_annoy_search(self):
52
+ print("\n=== Phase 4: Testing Annoy Search ===")
53
+ print("• Loading embeddings...")
54
+ # load embeddings
55
+ emb = np.load(self.processor.models_dir / "embeddings" / "emergency_embeddings.npy")
56
+ print(f"• Loaded embeddings shape: {emb.shape}")
57
+
58
+ print("• Loading Annoy index...")
59
+ # load Annoy index
60
+ idx = AnnoyIndex(self.processor.embedding_dim, 'angular')
61
+ index_path = self.processor.models_dir / "indices" / "annoy" / "emergency_index.ann"
62
+ print(f"• Index path: {index_path}")
63
+ idx.load(str(index_path))
64
+
65
+ print("• Performing sample query...")
66
+ # perform a sample query
67
+ query_vec = emb[0]
68
+ ids, distances = idx.get_nns_by_vector(query_vec, 5, include_distances=True)
69
+
70
+ print(f"• Search results:")
71
+ print(f" - Found IDs: {ids}")
72
+ print(f" - Distances: {[f'{d:.4f}' for d in distances]}")
73
+
74
+ assert len(ids) == 5, f"Expected 5 results, got {len(ids)}"
75
+ assert all(0 <= d <= 2 for d in distances), "Invalid distance values"
76
+ print("✅ Annoy search test passed")
77
+
78
+ def main():
79
+ """Run tests manually"""
80
+ print("\n" + "="*50)
81
+ print("Starting Embedding and Index Tests")
82
+ print("="*50)
83
+
84
+ test = TestEmbeddingAndIndex()
85
+ test.setup_class()  # Manual initialization
86
+
87
+ try:
88
+ test.test_embedding_dimensions()
89
+ test.test_annoy_search()
90
+ print("\n" + "="*50)
91
+ print("🎉 All tests completed successfully!")
92
+ print("="*50)
93
+
94
+ except Exception as e:
95
+ print("\n" + "="*50)
96
+ print("❌ Tests failed!")
97
+ print(f"Error: {str(e)}")
98
+ print("="*50)
99
+
100
+ if __name__ == "__main__":
101
+ main()
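Editor's note: the search test above relies on Annoy's 'angular' metric, whose distances are sqrt(2·(1 − cosine similarity)) and therefore fall in [0, 2]. A self-contained sketch of the same build-and-query workflow, using arbitrary random vectors instead of the project's saved embeddings:

# Minimal Annoy build-and-query example (random data; dimensions and tree count are arbitrary).
import numpy as np
from annoy import AnnoyIndex

dim = 768
rng = np.random.default_rng(0)
vectors = rng.normal(size=(100, dim)).astype("float32")

index = AnnoyIndex(dim, "angular")
for i, vec in enumerate(vectors):
    index.add_item(i, vec)
index.build(10)  # 10 trees

ids, distances = index.get_nns_by_vector(vectors[0], 5, include_distances=True)
print(ids[0] == 0)                          # self-retrieval: the nearest neighbour is the query itself
print(all(0 <= d <= 2 for d in distances))  # angular distances stay within [0, 2]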
tests/test_embedding_validation.py CHANGED
@@ -7,14 +7,27 @@ import numpy as np
7
  import json
8
  import logging
9
  import os
 
10
  from pathlib import Path
11
  from typing import Tuple, List, Optional
12
  from annoy import AnnoyIndex
13
  from sentence_transformers import SentenceTransformer
14
 
15
  class TestEmbeddingValidation:
16
  def setup_class(self):
17
  """Initialize test environment with necessary data and models."""
 
 
18
  # Setup logging
19
  logging.basicConfig(
20
  level=logging.DEBUG,
@@ -24,43 +37,57 @@ class TestEmbeddingValidation:
24
  self.logger = logging.getLogger(__name__)
25
 
26
  # Define base paths
27
- self.project_root = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
28
  self.models_dir = self.project_root / "models"
29
  self.embeddings_dir = self.models_dir / "embeddings"
30
  self.indices_dir = self.models_dir / "indices" / "annoy"
31
 
 
 
 
 
32
  self.logger.info(f"Project root: {self.project_root}")
33
  self.logger.info(f"Models directory: {self.models_dir}")
34
  self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
35
 
36
  try:
37
  # Check directory existence
 
38
  if not self.embeddings_dir.exists():
39
  raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
40
  if not self.indices_dir.exists():
41
  raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
42
 
43
  # Load embeddings
 
44
  self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
45
  self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
46
 
47
  # Load chunks
 
48
  with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
49
  self.emergency_chunks = json.load(f)
50
  with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
51
  self.treatment_chunks = json.load(f)
52
 
53
  # Initialize model
 
54
  self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
55
 
 
 
 
 
56
  self.logger.info("Test environment initialized successfully")
57
  self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
58
  self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
59
 
60
  except FileNotFoundError as e:
 
61
  self.logger.error(f"File not found: {e}")
62
  raise
63
  except Exception as e:
 
64
  self.logger.error(f"Error during initialization: {e}")
65
  raise
66
 
@@ -84,20 +111,28 @@ class TestEmbeddingValidation:
84
 
85
  def test_embedding_dimensions(self):
86
  """Test embedding dimensions and data quality."""
 
87
  self.logger.info("\n=== Embedding Validation Report ===")
88
 
89
  try:
90
  # Basic dimension checks
 
91
  assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
92
  assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
 
 
93
 
94
  # Count verification
 
95
  assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
96
  "Emergency chunks count mismatch"
97
  assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
98
  "Treatment chunks count mismatch"
 
 
99
 
100
  # Data quality checks
 
101
  for name, emb in [("Emergency", self.emergency_emb),
102
  ("Treatment", self.treatment_emb)]:
103
  # Check for NaN and Inf
@@ -105,25 +140,35 @@ class TestEmbeddingValidation:
105
  assert not np.isinf(emb).any(), f"{name} contains Inf values"
106
 
107
  # Value distribution analysis
 
 
 
 
 
108
  self.logger.info(f"\n{name} Embeddings Statistics:")
109
  self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
110
  self.logger.info(f"- Mean: {np.mean(emb):.3f}")
111
  self.logger.info(f"- Std: {np.std(emb):.3f}")
112
 
 
113
  self.logger.info("\n✅ All embedding validations passed")
114
 
115
  except AssertionError as e:
 
116
  self.logger.error(f"Validation failed: {str(e)}")
117
  raise
118
 
119
  def test_multiple_known_item_search(self):
120
  """Test ANNOY search with multiple random samples."""
 
121
  self.logger.info("\n=== Multiple Known-Item Search Test ===")
122
 
 
123
  emergency_index = AnnoyIndex(768, 'angular')
124
  emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
125
 
126
  # Test 20 random samples
 
127
  test_indices = np.random.choice(
128
  self.emergency_emb.shape[0],
129
  size=20,
@@ -131,36 +176,45 @@ class TestEmbeddingValidation:
131
  )
132
 
133
  success_count = 0
134
- for test_idx in test_indices:
 
135
  try:
136
  test_emb = self.emergency_emb[test_idx]
137
  indices, distances = self._safe_search(emergency_index, test_emb)
138
 
139
  if indices is None:
 
140
  continue
141
 
142
  # Verify self-retrieval
143
  assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
144
  assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
145
  success_count += 1
 
146
 
147
  except AssertionError as e:
 
148
  self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
149
 
 
150
  self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
151
  assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
 
152
 
153
  def test_balanced_cross_dataset_search(self):
154
  """Test search across both emergency and treatment datasets."""
 
155
  self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
156
 
157
  # Initialize indices
 
158
  emergency_index = AnnoyIndex(768, 'angular')
159
  treatment_index = AnnoyIndex(768, 'angular')
160
 
161
  try:
162
  emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
163
  treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
 
164
 
165
  # Test queries
166
  test_queries = [
@@ -169,45 +223,75 @@ class TestEmbeddingValidation:
169
  "What are the emergency procedures for anaphylactic shock?"
170
  ]
171
 
172
- for query in test_queries:
173
- print(f"\n\n=== Query: {query} ===")
 
 
174
 
175
  # Generate query vector
 
176
  query_emb = self.model.encode([query])[0]
177
 
178
  # Get top-5 results from each dataset
 
179
  e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
180
  t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
181
 
182
  if None in [e_indices, e_distances, t_indices, t_distances]:
 
183
  self.logger.error("Search failed for one or both datasets")
184
  continue
185
 
186
  # Print first sentence of each result
187
- print("\nEmergency Dataset Results:")
188
  for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
189
  text = self.emergency_chunks[idx]['text']
190
  first_sentence = text.split('.')[0] + '.'
191
- print(f"\nE-{i} (distance: {dist:.3f}):")
192
- print(first_sentence)
193
 
194
- print("\nTreatment Dataset Results:")
195
  for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
196
  text = self.treatment_chunks[idx]['text']
197
  first_sentence = text.split('.')[0] + '.'
198
- print(f"\nT-{i} (distance: {dist:.3f}):")
199
- print(first_sentence)
 
200
 
201
  except Exception as e:
 
202
  self.logger.error(f"Test failed: {str(e)}")
203
  raise
204
  else:
 
205
  self.logger.info("\n✅ Cross-dataset search test completed")
206
 
207
- if __name__ == "__main__":
208
- # Manual test execution
 
 
 
 
209
  test = TestEmbeddingValidation()
210
  test.setup_class()
211
- test.test_embedding_dimensions()
212
- test.test_multiple_known_item_search()
213
- test.test_balanced_cross_dataset_search()
7
  import json
8
  import logging
9
  import os
10
+ import sys
11
  from pathlib import Path
12
  from typing import Tuple, List, Optional
13
  from annoy import AnnoyIndex
14
  from sentence_transformers import SentenceTransformer
15
 
16
+ print("\n=== Phase 1: Initializing Test Environment ===")
17
+ # Add src to python path
18
+ current_dir = Path(__file__).parent.resolve()
19
+ project_root = current_dir.parent
20
+ sys.path.append(str(project_root / "src"))
21
+
22
+ print(f"• Current directory: {current_dir}")
23
+ print(f"• Project root: {project_root}")
24
+ print(f"• Python path added: {project_root / 'src'}")
25
+
26
  class TestEmbeddingValidation:
27
  def setup_class(self):
28
  """Initialize test environment with necessary data and models."""
29
+ print("\n=== Phase 2: Setting up Test Environment ===")
30
+
31
  # Setup logging
32
  logging.basicConfig(
33
  level=logging.DEBUG,
 
37
  self.logger = logging.getLogger(__name__)
38
 
39
  # Define base paths
40
+ self.project_root = Path(__file__).parent.parent.resolve()
41
  self.models_dir = self.project_root / "models"
42
  self.embeddings_dir = self.models_dir / "embeddings"
43
  self.indices_dir = self.models_dir / "indices" / "annoy"
44
 
45
+ print(f"• Project root: {self.project_root}")
46
+ print(f"• Models directory: {self.models_dir}")
47
+ print(f"• Embeddings directory: {self.embeddings_dir}")
48
+
49
  self.logger.info(f"Project root: {self.project_root}")
50
  self.logger.info(f"Models directory: {self.models_dir}")
51
  self.logger.info(f"Embeddings directory: {self.embeddings_dir}")
52
 
53
  try:
54
  # Check directory existence
55
+ print("• Checking directory existence...")
56
  if not self.embeddings_dir.exists():
57
  raise FileNotFoundError(f"Embeddings directory not found at: {self.embeddings_dir}")
58
  if not self.indices_dir.exists():
59
  raise FileNotFoundError(f"Indices directory not found at: {self.indices_dir}")
60
 
61
  # Load embeddings
62
+ print("• Loading embeddings...")
63
  self.emergency_emb = np.load(self.embeddings_dir / "emergency_embeddings.npy")
64
  self.treatment_emb = np.load(self.embeddings_dir / "treatment_embeddings.npy")
65
 
66
  # Load chunks
67
+ print("• Loading chunk metadata...")
68
  with open(self.embeddings_dir / "emergency_chunks.json", 'r') as f:
69
  self.emergency_chunks = json.load(f)
70
  with open(self.embeddings_dir / "treatment_chunks.json", 'r') as f:
71
  self.treatment_chunks = json.load(f)
72
 
73
  # Initialize model
74
+ print("• Loading PubMedBERT model...")
75
  self.model = SentenceTransformer("NeuML/pubmedbert-base-embeddings")
76
 
77
+ print(f"• Emergency embeddings shape: {self.emergency_emb.shape}")
78
+ print(f"• Treatment embeddings shape: {self.treatment_emb.shape}")
79
+ print("✅ Test environment initialized successfully")
80
+
81
  self.logger.info("Test environment initialized successfully")
82
  self.logger.info(f"Emergency embeddings shape: {self.emergency_emb.shape}")
83
  self.logger.info(f"Treatment embeddings shape: {self.treatment_emb.shape}")
84
 
85
  except FileNotFoundError as e:
86
+ print(f"❌ File not found: {e}")
87
  self.logger.error(f"File not found: {e}")
88
  raise
89
  except Exception as e:
90
+ print(f"❌ Error during initialization: {e}")
91
  self.logger.error(f"Error during initialization: {e}")
92
  raise
93
 
 
111
 
112
  def test_embedding_dimensions(self):
113
  """Test embedding dimensions and data quality."""
114
+ print("\n=== Phase 3: Embedding Validation ===")
115
  self.logger.info("\n=== Embedding Validation Report ===")
116
 
117
  try:
118
  # Basic dimension checks
119
+ print("• Checking embedding dimensions...")
120
  assert self.emergency_emb.shape[1] == 768, "Emergency embedding dimension should be 768"
121
  assert self.treatment_emb.shape[1] == 768, "Treatment embedding dimension should be 768"
122
+ print(f"✓ Emergency dimensions: {self.emergency_emb.shape}")
123
+ print(f"✓ Treatment dimensions: {self.treatment_emb.shape}")
124
 
125
  # Count verification
126
+ print("• Verifying chunk count consistency...")
127
  assert len(self.emergency_chunks) == self.emergency_emb.shape[0], \
128
  "Emergency chunks count mismatch"
129
  assert len(self.treatment_chunks) == self.treatment_emb.shape[0], \
130
  "Treatment chunks count mismatch"
131
+ print(f"✓ Emergency: {len(self.emergency_chunks)} chunks = {self.emergency_emb.shape[0]} embeddings")
132
+ print(f"✓ Treatment: {len(self.treatment_chunks)} chunks = {self.treatment_emb.shape[0]} embeddings")
133
 
134
  # Data quality checks
135
+ print("• Performing data quality checks...")
136
  for name, emb in [("Emergency", self.emergency_emb),
137
  ("Treatment", self.treatment_emb)]:
138
  # Check for NaN and Inf
 
140
  assert not np.isinf(emb).any(), f"{name} contains Inf values"
141
 
142
  # Value distribution analysis
143
+ print(f"\n📊 {name} Embeddings Statistics:")
144
+ print(f"• Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
145
+ print(f"• Mean: {np.mean(emb):.3f}")
146
+ print(f"• Std: {np.std(emb):.3f}")
147
+
148
  self.logger.info(f"\n{name} Embeddings Statistics:")
149
  self.logger.info(f"- Range: {np.min(emb):.3f} to {np.max(emb):.3f}")
150
  self.logger.info(f"- Mean: {np.mean(emb):.3f}")
151
  self.logger.info(f"- Std: {np.std(emb):.3f}")
152
 
153
+ print("\n✅ All embedding validations passed")
154
  self.logger.info("\n✅ All embedding validations passed")
155
 
156
  except AssertionError as e:
157
+ print(f"❌ Validation failed: {str(e)}")
158
  self.logger.error(f"Validation failed: {str(e)}")
159
  raise
160
 
161
  def test_multiple_known_item_search(self):
162
  """Test ANNOY search with multiple random samples."""
163
+ print("\n=== Phase 4: Multiple Known-Item Search Test ===")
164
  self.logger.info("\n=== Multiple Known-Item Search Test ===")
165
 
166
+ print("• Loading emergency index...")
167
  emergency_index = AnnoyIndex(768, 'angular')
168
  emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
169
 
170
  # Test 20 random samples
171
+ print("• Selecting 20 random samples for self-retrieval test...")
172
  test_indices = np.random.choice(
173
  self.emergency_emb.shape[0],
174
  size=20,
 
176
  )
177
 
178
  success_count = 0
179
+ print("• Testing self-retrieval for each sample...")
180
+ for i, test_idx in enumerate(test_indices, 1):
181
  try:
182
  test_emb = self.emergency_emb[test_idx]
183
  indices, distances = self._safe_search(emergency_index, test_emb)
184
 
185
  if indices is None:
186
+ print(f" {i}/20: ❌ Search failed for index {test_idx}")
187
  continue
188
 
189
  # Verify self-retrieval
190
  assert indices[0] == test_idx, f"Self-retrieval failed for index {test_idx}"
191
  assert distances[0] < 0.0001, f"Self-distance too large for index {test_idx}"
192
  success_count += 1
193
+ print(f" {i}/20: ✓ Index {test_idx} (distance: {distances[0]:.6f})")
194
 
195
  except AssertionError as e:
196
+ print(f" {i}/20: ❌ Index {test_idx} failed: {str(e)}")
197
  self.logger.warning(f"Test failed for index {test_idx}: {str(e)}")
198
 
199
+ print(f"\n📊 Self-Retrieval Results: {success_count}/20 tests passed ({success_count/20*100:.1f}%)")
200
  self.logger.info(f"\n✅ {success_count}/20 self-retrieval tests passed")
201
  assert success_count >= 18, "Less than 90% of self-retrieval tests passed"
202
+ print("✅ Multiple known-item search test passed")
203
 
204
  def test_balanced_cross_dataset_search(self):
205
  """Test search across both emergency and treatment datasets."""
206
+ print("\n=== Phase 5: Cross-Dataset Search Test ===")
207
  self.logger.info("\n=== Balanced Cross-Dataset Search Test ===")
208
 
209
  # Initialize indices
210
+ print("• Loading ANNOY indices...")
211
  emergency_index = AnnoyIndex(768, 'angular')
212
  treatment_index = AnnoyIndex(768, 'angular')
213
 
214
  try:
215
  emergency_index.load(str(self.indices_dir / "emergency_index.ann"))
216
  treatment_index.load(str(self.indices_dir / "treatment_index.ann"))
217
+ print("✓ Emergency and treatment indices loaded")
218
 
219
  # Test queries
220
  test_queries = [
 
223
  "What are the emergency procedures for anaphylactic shock?"
224
  ]
225
 
226
+ print(f"• Testing {len(test_queries)} medical queries...")
227
+
228
+ for query_num, query in enumerate(test_queries, 1):
229
+ print(f"\n🔍 Query {query_num}/{len(test_queries)}: {query}")
230
 
231
  # Generate query vector
232
+ print("• Generating query embedding...")
233
  query_emb = self.model.encode([query])[0]
234
 
235
  # Get top-5 results from each dataset
236
+ print("• Searching both datasets...")
237
  e_indices, e_distances = self._safe_search(emergency_index, query_emb, k=5)
238
  t_indices, t_distances = self._safe_search(treatment_index, query_emb, k=5)
239
 
240
  if None in [e_indices, e_distances, t_indices, t_distances]:
241
+ print("❌ Search failed for one or both datasets")
242
  self.logger.error("Search failed for one or both datasets")
243
  continue
244
 
245
  # Print first sentence of each result
246
+ print(f"\n📋 Emergency Dataset Results:")
247
  for i, (idx, dist) in enumerate(zip(e_indices, e_distances), 1):
248
  text = self.emergency_chunks[idx]['text']
249
  first_sentence = text.split('.')[0] + '.'
250
+ print(f" E-{i} (distance: {dist:.3f}): {first_sentence[:80]}...")
 
251
 
252
+ print(f"\n📋 Treatment Dataset Results:")
253
  for i, (idx, dist) in enumerate(zip(t_indices, t_distances), 1):
254
  text = self.treatment_chunks[idx]['text']
255
  first_sentence = text.split('.')[0] + '.'
256
+ print(f" T-{i} (distance: {dist:.3f}): {first_sentence[:80]}...")
257
+
258
+ print("✓ Query completed")
259
 
260
  except Exception as e:
261
+ print(f"❌ Test failed: {str(e)}")
262
  self.logger.error(f"Test failed: {str(e)}")
263
  raise
264
  else:
265
+ print("\n✅ Cross-dataset search test completed")
266
  self.logger.info("\n✅ Cross-dataset search test completed")
267
 
268
+ def main():
269
+ """Run all embedding validation tests"""
270
+ print("\n" + "="*60)
271
+ print("COMPREHENSIVE EMBEDDING VALIDATION TEST SUITE")
272
+ print("="*60)
273
+
274
  test = TestEmbeddingValidation()
275
  test.setup_class()
276
+
277
+ try:
278
+ test.test_embedding_dimensions()
279
+ test.test_multiple_known_item_search()
280
+ test.test_balanced_cross_dataset_search()
281
+
282
+ print("\n" + "="*60)
283
+ print("🎉 ALL EMBEDDING VALIDATION TESTS COMPLETED SUCCESSFULLY!")
284
+ print("="*60)
285
+ print("✅ Embedding dimensions validated")
286
+ print("✅ Self-retrieval accuracy confirmed")
287
+ print("✅ Cross-dataset search functionality verified")
288
+ print("="*60)
289
+
290
+ except Exception as e:
291
+ print("\n" + "="*60)
292
+ print("❌ EMBEDDING VALIDATION TESTS FAILED!")
293
+ print(f"Error: {str(e)}")
294
+ print("="*60)
295
+
296
+ if __name__ == "__main__":
297
+ main()
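Editor's note: the self-retrieval assertion `distances[0] < 0.0001` is easier to interpret once Annoy's angular distance is converted back to cosine similarity. A tiny sketch of that conversion (plain arithmetic, no project code involved):

# Angular distance d = sqrt(2 * (1 - cos)), hence cos = 1 - d**2 / 2.
def angular_to_cosine(d):
    return 1.0 - (d ** 2) / 2.0

for d in (0.0, 0.0001, 0.5, 1.0, 2.0):
    print(f"distance {d:.4f} -> cosine {angular_to_cosine(d):.6f}")
# A self-match (distance ~0) corresponds to cosine ~1.0; the maximum distance 2.0
# corresponds to cosine -1.0 (opposite vectors).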
tests/test_end_to_end_pipeline.py ADDED
@@ -0,0 +1,473 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ End-to-End Pipeline Script Test for OnCall.ai
4
+
5
+ Tests the complete pipeline:
6
+ User Input → UserPrompt Processing → Retrieval → Generation → Structured Medical Advice
7
+
8
+ This script validates the entire workflow with realistic medical queries,
9
+ simulating the user confirmation process and generating final medical advice.
10
+
11
+ Author: OnCall.ai Team
12
+ Date: 2025-07-31
13
+ """
14
+
15
+ import sys
16
+ import os
17
+ from pathlib import Path
18
+ import logging
19
+ import json
20
+ import traceback
21
+ from datetime import datetime
22
+ from typing import Dict, List, Any, Optional
23
+
24
+ # Add src directory to Python path
25
+ current_dir = Path(__file__).parent
26
+ project_root = current_dir.parent
27
+ src_dir = project_root / "src"
28
+ sys.path.insert(0, str(src_dir))
29
+
30
+ # Import all pipeline modules
31
+ try:
32
+ from user_prompt import UserPromptProcessor
33
+ from retrieval import BasicRetrievalSystem
34
+ from llm_clients import llm_Med42_70BClient
35
+ from generation import MedicalAdviceGenerator
36
+ from medical_conditions import CONDITION_KEYWORD_MAPPING
37
+ except ImportError as e:
38
+ print(f"❌ Import Error: {e}")
39
+ print(f"Current working directory: {os.getcwd()}")
40
+ print(f"Python path: {sys.path}")
41
+ sys.exit(1)
42
+
43
+ # Configure logging
44
+ logging.basicConfig(
45
+ level=logging.INFO,
46
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
47
+ handlers=[
48
+ logging.StreamHandler(),
49
+ logging.FileHandler(project_root / 'tests' / 'end_to_end_pipeline.log')
50
+ ]
51
+ )
52
+ logger = logging.getLogger(__name__)
53
+
54
+ class EndToEndPipelineTest:
55
+ """Complete pipeline test with realistic medical scenarios"""
56
+
57
+ def __init__(self):
58
+ """Initialize test suite"""
59
+ self.start_time = datetime.now()
60
+ self.test_results = []
61
+ self.components_initialized = False
62
+
63
+ # Pipeline components
64
+ self.llm_client = None
65
+ self.retrieval_system = None
66
+ self.user_prompt_processor = None
67
+ self.medical_generator = None
68
+
69
+ def initialize_complete_pipeline(self):
70
+ """Initialize all pipeline components"""
71
+ print("🔧 Initializing Complete OnCall.ai Pipeline...")
72
+ print("-" * 60)
73
+
74
+ try:
75
+ # Initialize LLM client
76
+ print("1. Initializing Med42-70B Client...")
77
+ self.llm_client = llm_Med42_70BClient()
78
+ print(" ✅ Med42-70B client ready")
79
+
80
+ # Initialize retrieval system
81
+ print("2. Initializing Dual-Index Retrieval System...")
82
+ self.retrieval_system = BasicRetrievalSystem()
83
+ print(" ✅ Emergency & Treatment indices loaded")
84
+
85
+ # Initialize user prompt processor
86
+ print("3. Initializing Multi-Level Prompt Processor...")
87
+ self.user_prompt_processor = UserPromptProcessor(
88
+ llm_client=self.llm_client,
89
+ retrieval_system=self.retrieval_system
90
+ )
91
+ print(" ✅ Fallback validation system ready")
92
+
93
+ # Initialize medical advice generator
94
+ print("4. Initializing Medical Advice Generator...")
95
+ self.medical_generator = MedicalAdviceGenerator(
96
+ llm_client=self.llm_client
97
+ )
98
+ print(" ✅ RAG generation system ready")
99
+
100
+ self.components_initialized = True
101
+ print(f"\n🎉 Complete pipeline initialized successfully!")
102
+
103
+ except Exception as e:
104
+ logger.error(f"Pipeline initialization failed: {e}")
105
+ print(f"❌ Initialization failed: {e}")
106
+ traceback.print_exc()
107
+ self.components_initialized = False
108
+
109
+ def get_realistic_test_queries(self) -> List[Dict[str, Any]]:
110
+ """Define realistic medical queries for end-to-end testing"""
111
+ return [
112
+ {
113
+ "id": "e2e_001",
114
+ "query": "How to treat acute myocardial infarction in emergency department?",
115
+ "description": "Classic cardiac emergency with treatment focus",
116
+ "expected_intention": "treatment",
117
+ "category": "cardiac_emergency",
118
+ "simulated_confirmation": "yes"
119
+ },
120
+ {
121
+ "id": "e2e_002",
122
+ "query": "Patient presenting with severe chest pain and shortness of breath",
123
+ "description": "Symptom-based emergency requiring assessment and treatment",
124
+ "expected_intention": "diagnosis",
125
+ "category": "multi_symptom",
126
+ "simulated_confirmation": "yes"
127
+ },
128
+ {
129
+ "id": "e2e_003",
130
+ "query": "What are the emergency protocols for acute stroke management?",
131
+ "description": "Neurological emergency with protocol focus",
132
+ "expected_intention": "treatment",
133
+ "category": "neurological_emergency",
134
+ "simulated_confirmation": "yes"
135
+ },
136
+ {
137
+ "id": "e2e_004",
138
+ "query": "Differential diagnosis for sudden onset chest pain in young adult",
139
+ "description": "Diagnostic reasoning query",
140
+ "expected_intention": "diagnosis",
141
+ "category": "differential_diagnosis",
142
+ "simulated_confirmation": "yes"
143
+ },
144
+ {
145
+ "id": "e2e_005",
146
+ "query": "Emergency management of pulmonary embolism",
147
+ "description": "Pulmonary emergency requiring immediate intervention",
148
+ "expected_intention": "treatment",
149
+ "category": "pulmonary_emergency",
150
+ "simulated_confirmation": "yes"
151
+ },
152
+ {
153
+ "id": "e2e_006",
154
+ "query": "How to cook pasta properly?",
155
+ "description": "Non-medical query - should be rejected",
156
+ "expected_intention": None,
157
+ "category": "non_medical",
158
+ "simulated_confirmation": "reject_expected"
159
+ }
160
+ ]
161
+
162
+ def run_scripted_end_to_end_tests(self):
163
+ """Execute complete end-to-end tests with realistic queries"""
164
+ if not self.components_initialized:
165
+ print("❌ Cannot run tests: pipeline not initialized")
166
+ return
167
+
168
+ test_queries = self.get_realistic_test_queries()
169
+
170
+ print(f"\n🚀 Starting End-to-End Pipeline Tests")
171
+ print(f"Total test scenarios: {len(test_queries)}")
172
+ print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
173
+ print("=" * 80)
174
+
175
+ # Execute all tests
176
+ for test_case in test_queries:
177
+ result = self._execute_single_pipeline_test(test_case)
178
+ self.test_results.append(result)
179
+
180
+ # Generate comprehensive report
181
+ self._generate_end_to_end_report()
182
+ self._save_end_to_end_results()
183
+
184
+ def _execute_single_pipeline_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
185
+ """Execute single test through complete pipeline"""
186
+ test_id = test_case["id"]
187
+ query = test_case["query"]
188
+
189
+ print(f"\n🧪 {test_id}: {test_case['description']}")
190
+ print(f"Query: '{query}'")
191
+ print(f"Expected: {test_case['expected_intention']} intention")
192
+ print("-" * 70)
193
+
194
+ pipeline_start = datetime.now()
195
+ result = {
196
+ "test_id": test_id,
197
+ "test_case": test_case,
198
+ "timestamp": datetime.now().isoformat(),
199
+ "success": False,
200
+ "error": None,
201
+ "total_pipeline_time": 0,
202
+ "pipeline_steps": {}
203
+ }
204
+
205
+ try:
206
+ # STEP 1: User Prompt Processing
207
+ print(" 🎯 Step 1: Condition extraction and validation...")
208
+ step1_start = datetime.now()
209
+
210
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
211
+ step1_time = (datetime.now() - step1_start).total_seconds()
212
+
213
+ result["pipeline_steps"]["condition_extraction"] = {
214
+ "duration": step1_time,
215
+ "result": condition_result,
216
+ "condition_found": bool(condition_result.get('condition'))
217
+ }
218
+
219
+ print(f" Condition: {condition_result.get('condition', 'None')}")
220
+ print(f" Keywords: Emergency='{condition_result.get('emergency_keywords', 'None')}', Treatment='{condition_result.get('treatment_keywords', 'None')}'")
221
+ print(f" Time: {step1_time:.3f}s")
222
+
223
+ # Check if this is a non-medical query that should be rejected
224
+ if condition_result.get('type') == 'invalid_query':
225
+ print(" 🚫 Non-medical query correctly rejected")
226
+ result["pipeline_steps"]["rejection"] = {
227
+ "reason": "non_medical_query",
228
+ "message": condition_result.get('message', '')
229
+ }
230
+ result["success"] = test_case['category'] == 'non_medical'
231
+ return result
232
+
233
+ # STEP 2: User Confirmation (Simulated)
234
+ print(" 🤝 Step 2: User confirmation (simulated as 'yes')...")
235
+ confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
236
+
237
+ result["pipeline_steps"]["confirmation"] = {
238
+ "type": confirmation.get('type', 'unknown'),
239
+ "simulated_response": test_case['simulated_confirmation']
240
+ }
241
+
242
+ if not condition_result.get('condition'):
243
+ print(" ⚠️ No condition extracted, skipping retrieval and generation")
244
+ result["pipeline_steps"]["pipeline_stopped"] = "no_condition"
245
+ return result
246
+
247
+ # STEP 3: Retrieval
248
+ print(" 🔍 Step 3: Medical guideline retrieval...")
249
+ step3_start = datetime.now()
250
+
251
+ search_query = f"{condition_result.get('emergency_keywords', '')} {condition_result.get('treatment_keywords', '')}".strip()
252
+ if not search_query:
253
+ search_query = condition_result.get('condition', query)
254
+
255
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
256
+ step3_time = (datetime.now() - step3_start).total_seconds()
257
+
258
+ processed_results = retrieval_results.get('processed_results', [])
259
+ emergency_count = len([r for r in processed_results if r.get('type') == 'emergency'])
260
+ treatment_count = len([r for r in processed_results if r.get('type') == 'treatment'])
261
+
262
+ result["pipeline_steps"]["retrieval"] = {
263
+ "duration": step3_time,
264
+ "search_query": search_query,
265
+ "total_results": len(processed_results),
266
+ "emergency_results": emergency_count,
267
+ "treatment_results": treatment_count
268
+ }
269
+
270
+ print(f" Search Query: '{search_query}'")
271
+ print(f" Results: {len(processed_results)} total ({emergency_count} emergency, {treatment_count} treatment)")
272
+ print(f" Time: {step3_time:.3f}s")
273
+
274
+ # STEP 4: Medical Advice Generation
275
+ print(" 🧠 Step 4: Medical advice generation...")
276
+ step4_start = datetime.now()
277
+
278
+ # Determine intention (simulate intelligent detection)
279
+ intention = test_case.get('expected_intention')
280
+
281
+ medical_advice = self.medical_generator.generate_medical_advice(
282
+ user_query=query,
283
+ retrieval_results=retrieval_results,
284
+ intention=intention
285
+ )
286
+ step4_time = (datetime.now() - step4_start).total_seconds()
287
+
288
+ result["pipeline_steps"]["generation"] = {
289
+ "duration": step4_time,
290
+ "intention_used": intention,
291
+ "confidence_score": medical_advice.get('confidence_score', 0.0),
292
+ "advice_length": len(medical_advice.get('medical_advice', '')),
293
+ "chunks_used": medical_advice.get('query_metadata', {}).get('total_chunks_used', 0)
294
+ }
295
+
296
+ print(f" Intention: {intention}")
297
+ print(f" Confidence: {medical_advice.get('confidence_score', 0.0):.2f}")
298
+ print(f" Advice Length: {len(medical_advice.get('medical_advice', ''))} chars")
299
+ print(f" Chunks Used: {medical_advice.get('query_metadata', {}).get('total_chunks_used', 0)}")
300
+ print(f" Time: {step4_time:.3f}s")
301
+
302
+ # STEP 5: Results Summary
303
+ total_time = (datetime.now() - pipeline_start).total_seconds()
304
+ result["total_pipeline_time"] = total_time
305
+ result["final_medical_advice"] = medical_advice
306
+ result["success"] = True
307
+
308
+ print(f"\n ✅ Pipeline completed successfully!")
309
+ print(f" 📊 Total Time: {total_time:.3f}s")
310
+ print(f" 🩺 Medical Advice Preview:")
311
+ print(f" {medical_advice.get('medical_advice', 'No advice generated')[:150]}...")
312
+
313
+ except Exception as e:
314
+ total_time = (datetime.now() - pipeline_start).total_seconds()
315
+ result["total_pipeline_time"] = total_time
316
+ result["error"] = str(e)
317
+ result["traceback"] = traceback.format_exc()
318
+
319
+ logger.error(f"Pipeline test {test_id} failed: {e}")
320
+ print(f" ❌ Pipeline failed: {e}")
321
+
322
+ return result
323
+
324
+ def _determine_extraction_source(self, condition_result: Dict) -> str:
325
+ """Determine how the condition was extracted"""
326
+ if condition_result.get('semantic_confidence') is not None:
327
+ return "semantic_search"
328
+ elif condition_result.get('generic_confidence') is not None:
329
+ return "generic_search"
330
+ elif condition_result.get('condition') in CONDITION_KEYWORD_MAPPING:
331
+ return "predefined_mapping"
332
+ else:
333
+ return "llm_extraction"
334
+
335
+ def _generate_end_to_end_report(self):
336
+ """Generate comprehensive end-to-end test report"""
337
+ end_time = datetime.now()
338
+ total_duration = (end_time - self.start_time).total_seconds()
339
+
340
+ successful_tests = [r for r in self.test_results if r['success']]
341
+ failed_tests = [r for r in self.test_results if not r['success']]
342
+
343
+ print("\n" + "=" * 80)
344
+ print("📊 END-TO-END PIPELINE TEST REPORT")
345
+ print("=" * 80)
346
+
347
+ # Overall Statistics
348
+ print(f"🕐 Execution Summary:")
349
+ print(f" Test session duration: {total_duration:.3f}s")
350
+ print(f" Average per test: {total_duration/len(self.test_results):.3f}s")
351
+
352
+ print(f"\n📈 Pipeline Results:")
353
+ print(f" Total tests: {len(self.test_results)}")
354
+ print(f" Successful: {len(successful_tests)} ✅")
355
+ print(f" Failed: {len(failed_tests)} ❌")
356
+ print(f" Success rate: {len(successful_tests)/len(self.test_results)*100:.1f}%")
357
+
358
+ # Performance Analysis
359
+ if successful_tests:
360
+ print(f"\n⚡ Performance Analysis:")
361
+
362
+ # Calculate average times for each step
363
+ step_times = {}
364
+ for result in successful_tests:
365
+ for step_name, step_data in result.get('pipeline_steps', {}).items():
366
+ if 'duration' in step_data:
367
+ if step_name not in step_times:
368
+ step_times[step_name] = []
369
+ step_times[step_name].append(step_data['duration'])
370
+
371
+ for step_name, times in step_times.items():
372
+ avg_time = sum(times) / len(times)
373
+ print(f" {step_name.replace('_', ' ').title()}: {avg_time:.3f}s average")
374
+
375
+ # Overall pipeline performance
376
+ total_times = [r['total_pipeline_time'] for r in successful_tests]
377
+ avg_total = sum(total_times) / len(total_times)
378
+ print(f" Complete Pipeline: {avg_total:.3f}s average")
379
+
380
+ # Detailed Results
381
+ print(f"\n📝 Detailed Test Results:")
382
+ for result in self.test_results:
383
+ test_case = result['test_case']
384
+ status = "✅ PASS" if result['success'] else "❌ FAIL"
385
+
386
+ print(f"\n 📋 {result['test_id']}: {status}")
387
+ print(f" Query: '{test_case['query']}'")
388
+ print(f" Category: {test_case['category']}")
389
+ print(f" Total Time: {result['total_pipeline_time']:.3f}s")
390
+
391
+ if result['success']:
392
+ steps = result.get('pipeline_steps', {})
393
+ if 'condition_extraction' in steps:
394
+ condition = steps['condition_extraction']['result'].get('condition', 'None')
395
+ print(f" Condition Extracted: {condition}")
396
+
397
+ if 'generation' in steps:
398
+ confidence = steps['generation'].get('confidence_score', 0.0)
399
+ chunks = steps['generation'].get('chunks_used', 0)
400
+ print(f" Generation: {confidence:.2f} confidence, {chunks} chunks")
401
+
402
+ if 'final_medical_advice' in result:
403
+ advice = result['final_medical_advice'].get('medical_advice', '')
404
+ print(f" Advice Preview: {advice[:100]}...")
405
+ else:
406
+ if result.get('error'):
407
+ print(f" Error: {result['error']}")
408
+ elif 'rejection' in result.get('pipeline_steps', {}):
409
+ print(f" Rejected: {result['pipeline_steps']['rejection']['reason']}")
410
+
411
+ print("\n" + "=" * 80)
412
+
413
+ def _save_end_to_end_results(self):
414
+ """Save detailed test results to JSON file"""
415
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
416
+ filename = project_root / 'tests' / f'end_to_end_pipeline_results_{timestamp}.json'
417
+
418
+ try:
419
+ comprehensive_results = {
420
+ "test_metadata": {
421
+ "test_type": "end_to_end_pipeline",
422
+ "timestamp": datetime.now().isoformat(),
423
+ "session_start": self.start_time.isoformat(),
424
+ "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
425
+ "total_tests": len(self.test_results),
426
+ "successful_tests": len([r for r in self.test_results if r['success']]),
427
+ "failed_tests": len([r for r in self.test_results if not r['success']])
428
+ },
429
+ "pipeline_results": self.test_results,
430
+ "component_status": {
431
+ "user_prompt_processor": "operational",
432
+ "retrieval_system": "operational",
433
+ "medical_generator": "operational",
434
+ "med42_llm_client": "operational"
435
+ }
436
+ }
437
+
438
+ with open(filename, 'w', encoding='utf-8') as f:
439
+ json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)
440
+
441
+ print(f"📁 End-to-end test results saved to: {filename}")
442
+
443
+ except Exception as e:
444
+ logger.error(f"Failed to save test results: {e}")
445
+ print(f"⚠️ Failed to save test results: {e}")
446
+
447
+ def main():
448
+ """Main execution function"""
449
+ print("🏥 OnCall.ai Complete End-to-End Pipeline Test")
450
+ print("Testing: User Input → UserPrompt → Retrieval → Generation")
451
+ print("=" * 70)
452
+
453
+ # Initialize test suite
454
+ test_suite = EndToEndPipelineTest()
455
+
456
+ # Initialize complete pipeline
457
+ test_suite.initialize_complete_pipeline()
458
+
459
+ if not test_suite.components_initialized:
460
+ print("❌ Pipeline initialization failed. Cannot proceed with testing.")
461
+ return 1
462
+
463
+ # Run scripted end-to-end tests
464
+ test_suite.run_scripted_end_to_end_tests()
465
+
466
+ print(f"\n🎯 End-to-end testing completed!")
467
+ print("Next step: Create Gradio interface for interactive testing")
468
+
469
+ return 0
470
+
471
+ if __name__ == "__main__":
472
+ exit_code = main()
473
+ sys.exit(exit_code)
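Editor's note: stripped of timing, logging, and reporting, the four-stage flow this script exercises reduces to a handful of calls. The sketch below reuses only the module and method names imported by the test itself; it assumes the project's src/ directory is on the path and the prebuilt indices exist, and it skips the simulated user-confirmation step for brevity.

# Condensed pipeline sketch: user input -> condition extraction -> retrieval -> generation.
from user_prompt import UserPromptProcessor
from retrieval import BasicRetrievalSystem
from llm_clients import llm_Med42_70BClient
from generation import MedicalAdviceGenerator

llm = llm_Med42_70BClient()
retrieval = BasicRetrievalSystem()
prompts = UserPromptProcessor(llm_client=llm, retrieval_system=retrieval)
generator = MedicalAdviceGenerator(llm_client=llm)

query = "Emergency management of pulmonary embolism"
condition = prompts.extract_condition_keywords(query)                 # Step 1: condition extraction
search_query = f"{condition.get('emergency_keywords', '')} {condition.get('treatment_keywords', '')}".strip()
results = retrieval.search(search_query or query, top_k=5)            # Step 3: guideline retrieval
advice = generator.generate_medical_advice(                           # Step 4: RAG generation
    user_query=query, retrieval_results=results, intention="treatment")
print(advice.get("medical_advice", "")[:200])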
tests/test_multilevel_fallback_validation.py ADDED
@@ -0,0 +1,553 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Multi-Level Fallback Validation Test Suite for OnCall.ai
4
+
5
+ This test specifically validates the 5-level fallback mechanism:
6
+ Level 1: Predefined Mapping (Fast Path)
7
+ Level 2: Llama3-Med42-70B Extraction
8
+ Level 3: Semantic Search Fallback
9
+ Level 4: Medical Query Validation
10
+ Level 5: Generic Medical Search
11
+
12
+ Author: OnCall.ai Team
13
+ Date: 2025-07-30
14
+ """
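Editor's note, for orientation only: the mapping from an extraction result to one of the five levels listed in the docstring above can be pictured as the small decision ladder below. It is a hypothetical sketch based on the result fields used elsewhere in these tests (type, condition, semantic_confidence, generic_confidence); it is not the module's actual _detect_fallback_level, which appears further down in this file.

# Hypothetical fallback-level detector matching the docstring's level definitions.
def sketch_detect_level(result, predefined_conditions):
    if not result:
        return 0  # no result
    if result.get("type") == "invalid_query":
        return 4  # rejected by medical query validation
    if result.get("condition") == "generic medical query" or result.get("generic_confidence") is not None:
        return 5  # generic medical search
    if result.get("semantic_confidence") is not None:
        return 3  # semantic search fallback
    if result.get("condition") in predefined_conditions:
        return 1  # predefined mapping fast path
    return 2      # LLM extraction

print(sketch_detect_level({"condition": "acute stroke"}, {"acute stroke", "pulmonary embolism"}))  # -> 1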
15
+
16
+ import sys
17
+ import os
18
+ from pathlib import Path
19
+ import logging
20
+ import json
21
+ import traceback
22
+ from datetime import datetime
23
+ from typing import Dict, List, Any, Optional
24
+
25
+ # Add src directory to Python path
26
+ current_dir = Path(__file__).parent
27
+ project_root = current_dir.parent
28
+ src_dir = project_root / "src"
29
+ sys.path.insert(0, str(src_dir))
30
+
31
+ # Import our modules
32
+ try:
33
+ from user_prompt import UserPromptProcessor
34
+ from retrieval import BasicRetrievalSystem
35
+ from llm_clients import llm_Med42_70BClient
36
+ from medical_conditions import CONDITION_KEYWORD_MAPPING
37
+ except ImportError as e:
38
+ print(f"❌ Import Error: {e}")
39
+ print(f"Current working directory: {os.getcwd()}")
40
+ print(f"Python path: {sys.path}")
41
+ sys.exit(1)
42
+
43
+ # Configure logging
44
+ logging.basicConfig(
45
+ level=logging.INFO,
46
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
47
+ handlers=[
48
+ logging.StreamHandler(),
49
+ logging.FileHandler(project_root / 'tests' / 'multilevel_fallback_test.log')
50
+ ]
51
+ )
52
+ logger = logging.getLogger(__name__)
53
+
54
+ class MultilevelFallbackTest:
55
+ """Test suite specifically for the 5-level fallback mechanism"""
56
+
57
+ def __init__(self):
58
+ """Initialize test suite"""
59
+ self.start_time = datetime.now()
60
+ self.results = []
61
+ self.components_initialized = False
62
+
63
+ # Component references
64
+ self.llm_client = None
65
+ self.retrieval_system = None
66
+ self.user_prompt_processor = None
67
+
68
+ def initialize_components(self):
69
+ """Initialize all pipeline components"""
70
+ print("🔧 Initializing Components for Multilevel Fallback Test...")
71
+ print("-" * 60)
72
+
73
+ try:
74
+ # Initialize LLM client
75
+ print("1. Initializing Llama3-Med42-70B Client...")
76
+ self.llm_client = llm_Med42_70BClient()
77
+ print(" ✅ LLM client initialized")
78
+
79
+ # Initialize retrieval system
80
+ print("2. Initializing Retrieval System...")
81
+ self.retrieval_system = BasicRetrievalSystem()
82
+ print(" ✅ Retrieval system initialized")
83
+
84
+ # Initialize user prompt processor
85
+ print("3. Initializing User Prompt Processor...")
86
+ self.user_prompt_processor = UserPromptProcessor(
87
+ llm_client=self.llm_client,
88
+ retrieval_system=self.retrieval_system
89
+ )
90
+ print(" ✅ User prompt processor initialized")
91
+
92
+ self.components_initialized = True
93
+ print("\n🎉 All components initialized successfully!")
94
+
95
+ except Exception as e:
96
+ logger.error(f"Component initialization failed: {e}")
97
+ print(f"❌ Component initialization failed: {e}")
98
+ traceback.print_exc()
99
+ self.components_initialized = False
100
+
101
+ def get_multilevel_test_cases(self) -> List[Dict[str, Any]]:
102
+ """Define test cases specifically targeting each fallback level"""
103
+ return [
104
+ # Level 1: Predefined Mapping Tests
105
+ {
106
+ "id": "level1_001",
107
+ "query": "acute myocardial infarction treatment",
108
+ "description": "Level 1: Direct predefined condition match",
109
+ "expected_level": 1,
110
+ "expected_condition": "acute myocardial infarction",
111
+ "expected_source": "predefined_mapping",
112
+ "category": "level1_predefined"
113
+ },
114
+ {
115
+ "id": "level1_002",
116
+ "query": "how to manage acute stroke?",
117
+ "description": "Level 1: Predefined stroke condition",
118
+ "expected_level": 1,
119
+ "expected_condition": "acute stroke",
120
+ "expected_source": "predefined_mapping",
121
+ "category": "level1_predefined"
122
+ },
123
+ {
124
+ "id": "level1_003",
125
+ "query": "pulmonary embolism emergency protocol",
126
+ "description": "Level 1: Predefined PE condition",
127
+ "expected_level": 1,
128
+ "expected_condition": "pulmonary embolism",
129
+ "expected_source": "predefined_mapping",
130
+ "category": "level1_predefined"
131
+ },
132
+
133
+ # Level 2: LLM Extraction Tests
134
+ {
135
+ "id": "level2_001",
136
+ "query": "patient with severe crushing chest pain radiating to left arm",
137
+ "description": "Level 2: Symptom-based query requiring LLM analysis",
138
+ "expected_level": 2,
139
+ "expected_condition": ["acute myocardial infarction", "acute coronary syndrome"],
140
+ "expected_source": "llm_extraction",
141
+ "category": "level2_llm"
142
+ },
143
+ {
144
+ "id": "level2_002",
145
+ "query": "sudden onset weakness on right side with speech difficulty",
146
+ "description": "Level 2: Neurological symptoms requiring LLM",
147
+ "expected_level": 2,
148
+ "expected_condition": ["acute stroke", "cerebrovascular accident"],
149
+ "expected_source": "llm_extraction",
150
+ "category": "level2_llm"
151
+ },
152
+
153
+ # Level 3: Semantic Search Tests
154
+ {
155
+ "id": "level3_001",
156
+ "query": "emergency management of cardiovascular crisis",
157
+ "description": "Level 3: Generic medical terms requiring semantic search",
158
+ "expected_level": 3,
159
+ "expected_source": "semantic_search",
160
+ "category": "level3_semantic"
161
+ },
162
+ {
163
+ "id": "level3_002",
164
+ "query": "urgent neurological intervention protocols",
165
+ "description": "Level 3: Medical terminology requiring semantic fallback",
166
+ "expected_level": 3,
167
+ "expected_source": "semantic_search",
168
+ "category": "level3_semantic"
169
+ },
170
+
171
+ # Level 4a: Non-Medical Query Rejection
172
+ {
173
+ "id": "level4a_001",
174
+ "query": "how to cook pasta properly?",
175
+ "description": "Level 4a: Non-medical query should be rejected",
176
+ "expected_level": 4,
177
+ "expected_result": "invalid_query",
178
+ "expected_source": "validation_rejection",
179
+ "category": "level4a_rejection"
180
+ },
181
+ {
182
+ "id": "level4a_002",
183
+ "query": "best programming language to learn in 2025",
184
+ "description": "Level 4a: Technology query should be rejected",
185
+ "expected_level": 4,
186
+ "expected_result": "invalid_query",
187
+ "expected_source": "validation_rejection",
188
+ "category": "level4a_rejection"
189
+ },
190
+ {
191
+ "id": "level4a_003",
192
+ "query": "weather forecast for tomorrow",
193
+ "description": "Level 4a: Weather query should be rejected",
194
+ "expected_level": 4,
195
+ "expected_result": "invalid_query",
196
+ "expected_source": "validation_rejection",
197
+ "category": "level4a_rejection"
198
+ },
199
+
200
+ # Level 4b + 5: Obscure Medical Terms → Generic Search
201
+ {
202
+ "id": "level4b_001",
203
+ "query": "rare hematologic malignancy treatment approaches",
204
+ "description": "Level 4b→5: Obscure medical query passing validation to generic search",
205
+ "expected_level": 5,
206
+ "expected_condition": "generic medical query",
207
+ "expected_source": "generic_search",
208
+ "category": "level4b_to_5"
209
+ },
210
+ {
211
+ "id": "level4b_002",
212
+ "query": "idiopathic thrombocytopenic purpura management guidelines",
213
+ "description": "Level 4b→5: Rare condition requiring generic medical search",
214
+ "expected_level": 5,
215
+ "expected_condition": "generic medical query",
216
+ "expected_source": "generic_search",
217
+ "category": "level4b_to_5"
218
+ },
219
+ {
220
+ "id": "level4b_003",
221
+ "query": "necrotizing fasciitis surgical intervention protocols",
222
+ "description": "Level 4b→5: Rare emergency condition → generic search",
223
+ "expected_level": 5,
224
+ "expected_condition": "generic medical query",
225
+ "expected_source": "generic_search",
226
+ "category": "level4b_to_5"
227
+ }
228
+ ]
229
+
230
+ def run_single_fallback_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
231
+ """Execute a single fallback test case with level detection"""
232
+ test_id = test_case["id"]
233
+ query = test_case["query"]
234
+
235
+ print(f"\n🔍 {test_id}: {test_case['description']}")
236
+ print(f"Query: '{query}'")
237
+ print(f"Expected Level: {test_case.get('expected_level', 'Unknown')}")
238
+ print("-" * 70)
239
+
240
+ result = {
241
+ "test_id": test_id,
242
+ "test_case": test_case,
243
+ "timestamp": datetime.now().isoformat(),
244
+ "success": False,
245
+ "error": None,
246
+ "execution_time": 0,
247
+ "detected_level": None,
248
+ "condition_result": {}
249
+ }
250
+
251
+ start_time = datetime.now()
252
+
253
+ try:
254
+ # Execute condition extraction with level detection
255
+ print("🎯 Executing multilevel fallback...")
256
+ condition_start = datetime.now()
257
+
258
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
259
+ condition_time = (datetime.now() - condition_start).total_seconds()
260
+
261
+ # Detect which level was used
262
+ detected_level = self._detect_fallback_level(condition_result)
263
+
264
+ result["condition_result"] = condition_result
265
+ result["detected_level"] = detected_level
266
+ result["execution_time"] = condition_time
267
+
268
+ print(f" ✅ Detected Level: {detected_level}")
269
+ print(f" Condition: {condition_result.get('condition', 'None')}")
270
+ print(f" Emergency Keywords: {condition_result.get('emergency_keywords', 'None')}")
271
+ print(f" Treatment Keywords: {condition_result.get('treatment_keywords', 'None')}")
272
+ print(f" Execution Time: {condition_time:.3f}s")
273
+
274
+ # Validate expected behavior
275
+ validation_result = self._validate_expected_behavior(test_case, detected_level, condition_result)
276
+ result.update(validation_result)
277
+
278
+ if result["success"]:
279
+ print(" 🎉 Test PASSED - Expected behavior achieved")
280
+ else:
281
+ print(f" ⚠️ Test PARTIAL - {result.get('validation_message', 'Unexpected behavior')}")
282
+
283
+ except Exception as e:
284
+ total_time = (datetime.now() - start_time).total_seconds()
285
+ result["execution_time"] = total_time
286
+ result["error"] = str(e)
287
+ result["traceback"] = traceback.format_exc()
288
+
289
+ logger.error(f"Test {test_id} failed: {e}")
290
+ print(f" ❌ Test FAILED: {e}")
291
+
292
+ return result
293
+
294
+ def _detect_fallback_level(self, condition_result: Dict[str, Any]) -> int:
295
+ """
296
+ Detect which fallback level was used based on the condition result.
297
+
298
+ Fallback levels:
299
+ 0: No result or unknown fallback level.
300
+ 1: Predefined Mapping (Fast Path) - The condition matches a predefined mapping.
301
+ 2: Llama3-Med42-70B Extraction - The condition is extracted by the LLM.
302
+ 3: Semantic Search Fallback - The result includes a semantic confidence score.
303
+ 4: Medical Query Validation - The query is deemed invalid (e.g., 'invalid_query').
304
+ 5: Generic Medical Search - The condition is identified as a generic medical query.
305
+
306
+ Args:
307
+ condition_result (Dict[str, Any]): The result of the condition extraction process.
308
+
309
+ Returns:
310
+ int: The detected fallback level (0-5).
311
+ """
312
+ if not condition_result:
313
+ return 0 # No result
314
+
315
+ # Check for validation rejection (Level 4a)
316
+ if condition_result.get('type') == 'invalid_query':
317
+ return 4
318
+
319
+ # Check for generic search (Level 5)
320
+ if condition_result.get('condition') == 'generic medical query':
321
+ return 5
322
+
323
+ # Check for semantic search (Level 3)
324
+ if 'semantic_confidence' in condition_result:
325
+ return 3
326
+
327
+ # Check for predefined mapping (Level 1)
328
+ condition = condition_result.get('condition', '')
329
+ if condition and condition in CONDITION_KEYWORD_MAPPING:
330
+ return 1
331
+
332
+ # Otherwise assume LLM extraction (Level 2)
333
+ if condition:
334
+ return 2
335
+
336
+ return 0 # Unknown
337
+
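+ # Illustrative sketch (comments only, not executed): hypothetical condition_result
+ # dicts and the level the checks above would assign, assuming these keys match what
+ # extract_condition_keywords actually returns:
+ #   {'type': 'invalid_query'}                                -> 4 (validation rejection)
+ #   {'condition': 'generic medical query'}                   -> 5 (generic search)
+ #   {'condition': 'chest pain', 'semantic_confidence': 0.71} -> 3 (semantic fallback)
+ #   {'condition': 'acute myocardial infarction'}             -> 1 (if a CONDITION_KEYWORD_MAPPING key)
+ #   {'condition': 'some LLM-extracted condition'}            -> 2 (otherwise)
+ #   {}                                                       -> 0 (no result)
+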
338
+ def _validate_expected_behavior(self, test_case: Dict[str, Any], detected_level: int,
339
+ condition_result: Dict[str, Any]) -> Dict[str, Any]:
340
+ """Validate if the test behaved as expected"""
341
+ expected_level = test_case.get('expected_level')
342
+ validation_result = {
343
+ "level_match": detected_level == expected_level,
344
+ "condition_match": False,
345
+ "success": False,
346
+ "validation_message": ""
347
+ }
348
+
349
+ # Check level match
350
+ if validation_result["level_match"]:
351
+ validation_result["validation_message"] += f"✅ Level {detected_level} as expected. "
352
+ else:
353
+ validation_result["validation_message"] += f"⚠️ Level {detected_level} != expected {expected_level}. "
354
+
355
+ # Check condition/result match based on test type
356
+ if test_case["category"] == "level4a_rejection":
357
+ # Should be rejected
358
+ validation_result["condition_match"] = condition_result.get('type') == 'invalid_query'
359
+ if validation_result["condition_match"]:
360
+ validation_result["validation_message"] += "✅ Query correctly rejected. "
361
+ else:
362
+ validation_result["validation_message"] += "⚠️ Query should have been rejected. "
363
+
364
+ elif test_case["category"] == "level4b_to_5":
365
+ # Should result in generic medical query
366
+ validation_result["condition_match"] = condition_result.get('condition') == 'generic medical query'
367
+ if validation_result["condition_match"]:
368
+ validation_result["validation_message"] += "✅ Generic medical search triggered. "
369
+ else:
370
+ validation_result["validation_message"] += "⚠️ Should trigger generic medical search. "
371
+
372
+ else:
373
+ # Check expected condition
374
+ expected_conditions = test_case.get('expected_condition', [])
375
+ if isinstance(expected_conditions, str):
376
+ expected_conditions = [expected_conditions]
377
+
378
+ actual_condition = condition_result.get('condition', '')
379
+ validation_result["condition_match"] = any(
380
+ expected.lower() in actual_condition.lower()
381
+ for expected in expected_conditions
382
+ )
383
+
384
+ if validation_result["condition_match"]:
385
+ validation_result["validation_message"] += f"✅ Condition '{actual_condition}' matches expected. "
386
+ else:
387
+ validation_result["validation_message"] += f"⚠️ Condition '{actual_condition}' != expected {expected_conditions}. "
388
+
389
+ # Overall success
390
+ validation_result["success"] = validation_result["level_match"] or validation_result["condition_match"]
391
+
392
+ return validation_result
393
+
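+ # Sketch of the dict returned above for a hypothetical level4a rejection case in which
+ # the detected level matches and the query was rejected (values illustrative only):
+ #   {'level_match': True, 'condition_match': True, 'success': True,
+ #    'validation_message': '✅ Level 4 as expected. ✅ Query correctly rejected. '}
+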
394
+ def run_all_fallback_tests(self):
395
+ """Execute all fallback tests and generate report"""
396
+ if not self.components_initialized:
397
+ print("❌ Cannot run tests: components not initialized")
398
+ return
399
+
400
+ test_cases = self.get_multilevel_test_cases()
401
+
402
+ print(f"\n🚀 Starting Multilevel Fallback Test Suite")
403
+ print(f"Total test cases: {len(test_cases)}")
404
+ print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
405
+ print("=" * 80)
406
+
407
+ # Execute all tests
408
+ for test_case in test_cases:
409
+ result = self.run_single_fallback_test(test_case)
410
+ self.results.append(result)
411
+
412
+ # Generate report
413
+ self.generate_fallback_report()
414
+ self.save_fallback_results()
415
+
416
+ def generate_fallback_report(self):
417
+ """Generate detailed fallback analysis report"""
418
+ end_time = datetime.now()
419
+ total_duration = (end_time - self.start_time).total_seconds()
420
+
421
+ successful_tests = [r for r in self.results if r['success']]
422
+ failed_tests = [r for r in self.results if not r['success'] and r.get('error')]
423
+ partial_tests = [r for r in self.results if not r['success'] and not r.get('error')]
424
+
425
+ print("\n" + "=" * 80)
426
+ print("📊 MULTILEVEL FALLBACK TEST REPORT")
427
+ print("=" * 80)
428
+
429
+ # Overall Statistics
430
+ print(f"🕐 Execution Summary:")
431
+ print(f" Total duration: {total_duration:.3f}s")
432
+ print(f" Average per test: {total_duration/len(self.results):.3f}s")
433
+
434
+ print(f"\n📈 Test Results:")
435
+ print(f" Total tests: {len(self.results)}")
436
+ print(f" Passed: {len(successful_tests)} ✅")
437
+ print(f" Partial: {len(partial_tests)} ⚠️")
438
+ print(f" Failed: {len(failed_tests)} ❌")
439
+ print(f" Success rate: {len(successful_tests)/len(self.results)*100:.1f}%")
440
+
441
+ # Level Distribution Analysis
442
+ level_distribution = {}
443
+ level_performance = {}
444
+
445
+ for result in self.results:
446
+ if not result.get('error'):
447
+ level = result.get('detected_level', 0)
448
+ level_distribution[level] = level_distribution.get(level, 0) + 1
449
+
450
+ if level not in level_performance:
451
+ level_performance[level] = []
452
+ level_performance[level].append(result['execution_time'])
453
+
454
+ print(f"\n🎯 Level Distribution Analysis:")
455
+ for level in sorted(level_distribution.keys()):
456
+ count = level_distribution[level]
457
+ avg_time = sum(level_performance[level]) / len(level_performance[level])
458
+ level_name = {
459
+ 1: "Predefined Mapping",
460
+ 2: "LLM Extraction",
461
+ 3: "Semantic Search",
462
+ 4: "Validation Rejection",
463
+ 5: "Generic Search"
464
+ }.get(level, f"Unknown ({level})")
465
+
466
+ print(f" Level {level} ({level_name}): {count} tests, avg {avg_time:.3f}s")
467
+
468
+ # Category Analysis
469
+ categories = {}
470
+ for result in self.results:
471
+ category = result['test_case']['category']
472
+ if category not in categories:
473
+ categories[category] = {'total': 0, 'passed': 0}
474
+ categories[category]['total'] += 1
475
+ if result['success']:
476
+ categories[category]['passed'] += 1
477
+
478
+ print(f"\n📋 Category Analysis:")
479
+ for category, stats in categories.items():
480
+ success_rate = stats['passed'] / stats['total'] * 100
481
+ print(f" {category}: {stats['passed']}/{stats['total']} ({success_rate:.1f}%)")
482
+
483
+ # Detailed Results
484
+ print(f"\n📝 Detailed Test Results:")
485
+ for result in self.results:
486
+ test_case = result['test_case']
487
+ status = "✅ PASS" if result['success'] else ("❌ FAIL" if result.get('error') else "⚠️ PARTIAL")
488
+
489
+ print(f"\n {result['test_id']}: {status}")
490
+ print(f" Query: '{test_case['query']}'")
491
+ print(f" Expected Level: {test_case.get('expected_level', 'N/A')}")
492
+ print(f" Detected Level: {result.get('detected_level', 'N/A')}")
493
+ print(f" Condition: {result.get('condition_result', {}).get('condition', 'None')}")
494
+ print(f" Time: {result['execution_time']:.3f}s")
495
+
496
+ if result.get('validation_message'):
497
+ print(f" Validation: {result['validation_message']}")
498
+
499
+ if result.get('error'):
500
+ print(f" Error: {result['error']}")
501
+
502
+ print("\n" + "=" * 80)
503
+
504
+ def save_fallback_results(self):
505
+ """Save detailed test results to JSON file"""
506
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
507
+ filename = project_root / 'tests' / f'multilevel_fallback_results_{timestamp}.json'
508
+
509
+ try:
510
+ comprehensive_results = {
511
+ "test_metadata": {
512
+ "timestamp": datetime.now().isoformat(),
513
+ "test_type": "multilevel_fallback_validation",
514
+ "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
515
+ "total_tests": len(self.results),
516
+ "passed_tests": len([r for r in self.results if r['success']]),
517
+ "failed_tests": len([r for r in self.results if not r['success']])
518
+ },
519
+ "fallback_results": self.results
520
+ }
521
+
522
+ with open(filename, 'w', encoding='utf-8') as f:
523
+ json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)
524
+
525
+ print(f"📁 Multilevel fallback results saved to: {filename}")
526
+
527
+ except Exception as e:
528
+ logger.error(f"Failed to save test results: {e}")
529
+ print(f"⚠️ Failed to save test results: {e}")
530
+
531
+ def main():
532
+ """Main execution function"""
533
+ print("🏥 OnCall.ai Multilevel Fallback Validation Test")
534
+ print("=" * 60)
535
+
536
+ # Initialize test suite
537
+ test_suite = MultilevelFallbackTest()
538
+
539
+ # Initialize components
540
+ test_suite.initialize_components()
541
+
542
+ if not test_suite.components_initialized:
543
+ print("❌ Test suite initialization failed. Exiting.")
544
+ return 1
545
+
546
+ # Run all fallback tests
547
+ test_suite.run_all_fallback_tests()
548
+
549
+ return 0
550
+
551
+ if __name__ == "__main__":
552
+ exit_code = main()
553
+ sys.exit(exit_code)
tests/test_retrieval.py ADDED
@@ -0,0 +1,206 @@
1
+ """
2
+ Test suite for BasicRetrievalSystem
3
+ This module tests the core retrieval functionality including:
4
+ - System initialization
5
+ - Basic search functionality
6
+ - Deduplication logic
7
+ - Result formatting
8
+ """
9
+
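+ # Usage note: this suite is driven by main() at the bottom of the file, e.g.
+ #   python tests/test_retrieval.py
+ # It adds src/ to sys.path, switches the working directory to the project root, and
+ # assumes the indices and chunk files loaded by BasicRetrievalSystem already exist.
+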
10
+ import sys
11
+ import os
12
+ from pathlib import Path
13
+ import logging
14
+
15
+ print("\n=== Phase 1: Initializing Test Environment ===")
16
+ # Add src to python path
17
+ current_dir = Path(__file__).parent.resolve()
18
+ project_root = current_dir.parent
19
+ sys.path.append(str(project_root / "src"))
20
+
21
+ print(f"• Current directory: {current_dir}")
22
+ print(f"• Project root: {project_root}")
23
+ print(f"• Python path added: {project_root / 'src'}")
24
+
25
+ # Change working directory to project root for file access
26
+ os.chdir(project_root)
27
+ print(f"• Changed working directory to: {project_root}")
28
+
29
+ from retrieval import BasicRetrievalSystem  # type: ignore
30
+
31
+ class TestRetrievalSystem:
32
+ """Test suite for basic retrieval system functionality"""
33
+
34
+ def setup_class(self):
35
+ """Initialize test environment"""
36
+ print("\n=== Phase 2: Setting up Test Environment ===")
37
+
38
+ # Setup logging to capture our logs
39
+ logging.basicConfig(
40
+ level=logging.INFO,
41
+ format='%(levelname)s:%(name)s:%(message)s',
42
+ handlers=[
43
+ logging.StreamHandler(),
44
+ logging.FileHandler('test_retrieval.log')
45
+ ]
46
+ )
47
+
48
+ try:
49
+ print("• Initializing BasicRetrievalSystem...")
50
+ self.retrieval = BasicRetrievalSystem(embedding_dim=768)
51
+ print("✅ Retrieval system initialized successfully")
52
+
53
+ except Exception as e:
54
+ print(f"❌ Failed to initialize retrieval system: {e}")
55
+ raise
56
+
57
+ def test_system_initialization(self):
58
+ """Test system initialization components"""
59
+ print("\n=== Phase 3: System Initialization Test ===")
60
+
61
+ print("• Checking embedding model...")
62
+ assert self.retrieval.embedding_model is not None, "Embedding model not loaded"
63
+ print("✓ Embedding model loaded")
64
+
65
+ print("• Checking emergency index...")
66
+ assert self.retrieval.emergency_index is not None, "Emergency index not loaded"
67
+ print("✓ Emergency index loaded")
68
+
69
+ print("• Checking treatment index...")
70
+ assert self.retrieval.treatment_index is not None, "Treatment index not loaded"
71
+ print("✓ Treatment index loaded")
72
+
73
+ print("• Checking chunk data...")
74
+ assert len(self.retrieval.emergency_chunks) > 0, "Emergency chunks not loaded"
75
+ assert len(self.retrieval.treatment_chunks) > 0, "Treatment chunks not loaded"
76
+ print(f"✓ Emergency chunks: {len(self.retrieval.emergency_chunks)}")
77
+ print(f"✓ Treatment chunks: {len(self.retrieval.treatment_chunks)}")
78
+
79
+ print("✅ System initialization test passed")
80
+
81
+ def test_basic_search_functionality(self):
82
+ """Test basic search functionality with medical queries"""
83
+ print("\n=== Phase 4: Basic Search Functionality Test ===")
84
+
85
+ test_queries = [
86
+ "What is the treatment for acute myocardial infarction?",
87
+ "How to manage chest pain in emergency?",
88
+ "Acute stroke treatment protocol"
89
+ ]
90
+
91
+ for i, query in enumerate(test_queries, 1):
92
+ print(f"\n🔍 Test Query {i}/3: {query}")
93
+
94
+ try:
95
+ results = self.retrieval.search(query)
96
+
97
+ # Basic structure checks
98
+ assert "query" in results, "Query not in results"
99
+ assert "processed_results" in results, "Processed results not found"
100
+ assert "total_results" in results, "Total results count missing"
101
+
102
+ processed_results = results["processed_results"]
103
+ print(f"• Results returned: {len(processed_results)}")
104
+
105
+ # Check result format and display ALL results
106
+ for j, result in enumerate(processed_results, 1): # Show ALL results
107
+ assert "type" in result, f"Result {j} missing 'type' field"
108
+ assert "text" in result, f"Result {j} missing 'text' field"
109
+ assert "distance" in result, f"Result {j} missing 'distance' field"
110
+ assert "chunk_id" in result, f"Result {j} missing 'chunk_id' field"
111
+
112
+ print(f" R-{j:2d} [{result['type']:9s}] (distance: {result['distance']:.3f}): {result['text'][:80]}...")
113
+
114
+ print(f"✓ Query {i} completed successfully")
115
+
116
+ except Exception as e:
117
+ print(f"❌ Query {i} failed: {e}")
118
+ raise
119
+
120
+ print("\n✅ Basic search functionality test passed")
121
+
122
+ def test_deduplication_logic(self):
123
+ """Test the text-based deduplication logic"""
124
+ print("\n=== Phase 5: Deduplication Logic Test ===")
125
+
126
+ # Create test data with duplicate texts
127
+ test_results = [
128
+ {"text": "Sample text 1", "distance": 0.1, "type": "emergency", "chunk_id": 1},
129
+ {"text": "Sample text 1", "distance": 0.105, "type": "emergency", "chunk_id": 2}, # Duplicate text
130
+ {"text": "Sample text 3", "distance": 0.2, "type": "treatment", "chunk_id": 3},
131
+ {"text": "Sample text 4", "distance": 0.3, "type": "treatment", "chunk_id": 4}
132
+ ]
133
+
134
+ print(f"• Original results: {len(test_results)}")
135
+ for i, result in enumerate(test_results, 1):
136
+ print(f" Test-{i}: distance={result['distance']}, type={result['type']}")
137
+
138
+ # Test deduplication
139
+ unique_results = self.retrieval._remove_duplicates(test_results)
140
+
141
+ print(f"• After deduplication: {len(unique_results)}")
142
+ for i, result in enumerate(unique_results, 1):
143
+ print(f" Kept-{i}: distance={result['distance']}, type={result['type']}")
144
+
145
+ # Verify deduplication worked
146
+ assert len(unique_results) < len(test_results), "Deduplication should remove duplicate texts"
147
+ print("✓ Text-based deduplication working correctly")
148
+
149
+ print("✅ Deduplication logic test passed")
150
+
151
+ def test_result_statistics(self):
152
+ """Test result statistics and logging"""
153
+ print("\n=== Phase 6: Result Statistics Test ===")
154
+
155
+ query = "Emergency cardiac arrest management"
156
+ print(f"• Testing with query: {query}")
157
+
158
+ # Capture logs by running search
159
+ results = self.retrieval.search(query)
160
+
161
+ # Verify we get statistics
162
+ assert "total_results" in results, "Total results missing"
163
+ assert "processing_info" in results, "Processing info missing"
164
+
165
+ total_results = results["total_results"]
166
+ duplicates_removed = results["processing_info"]["duplicates_removed"]
167
+
168
+ print(f"• Total results: {total_results}")
169
+ print(f"• Duplicates removed: {duplicates_removed}")
170
+ print("✓ Statistics logging working correctly")
171
+
172
+ print("✅ Result statistics test passed")
173
+
174
+ def main():
175
+ """Run all retrieval system tests"""
176
+ print("\n" + "="*60)
177
+ print("COMPREHENSIVE RETRIEVAL SYSTEM TEST SUITE")
178
+ print("="*60)
179
+
180
+ test = TestRetrievalSystem()
181
+
182
+ try:
183
+ test.setup_class()
184
+ test.test_system_initialization()
185
+ test.test_basic_search_functionality()
186
+ test.test_deduplication_logic()
187
+ test.test_result_statistics()
188
+
189
+ print("\n" + "="*60)
190
+ print("🎉 ALL RETRIEVAL SYSTEM TESTS COMPLETED SUCCESSFULLY!")
191
+ print("="*60)
192
+ print("✅ System initialization validated")
193
+ print("✅ Basic search functionality confirmed")
194
+ print("✅ Text-based deduplication working")
195
+ print("✅ Result statistics and logging verified")
196
+ print("="*60)
197
+
198
+ except Exception as e:
199
+ print("\n" + "="*60)
200
+ print("❌ RETRIEVAL SYSTEM TESTS FAILED!")
201
+ print(f"Error: {str(e)}")
202
+ print("="*60)
203
+ raise
204
+
205
+ if __name__ == "__main__":
206
+ main()
tests/test_user_prompt.py ADDED
@@ -0,0 +1,92 @@
1
+ """
2
+ User Prompt Processor Test Suite
3
+
4
+ Comprehensive unit tests for the UserPromptProcessor class
5
+ Ensures robust functionality across medical query scenarios.
6
+ """
7
+
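+ # Usage note: main() below simply delegates to pytest, so either invocation works:
+ #   python tests/test_user_prompt.py
+ #   pytest tests/test_user_prompt.py -v
+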
8
+ import pytest
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ # Dynamically add project root to Python path
13
+ project_root = Path(__file__).parent.parent
14
+ sys.path.insert(0, str(project_root / "src"))
15
+
16
+ from user_prompt import UserPromptProcessor
17
+
18
+ class TestUserPromptProcessor:
19
+ """Test suite for UserPromptProcessor functionality"""
20
+
21
+ def setup_method(self):
22
+ """Initialize test environment before each test method"""
23
+ self.processor = UserPromptProcessor()
24
+
25
+ def test_extract_condition_keywords_predefined(self):
26
+ """Test predefined condition extraction"""
27
+ query = "heart attack symptoms"
28
+ result = self.processor.extract_condition_keywords(query)
29
+
30
+ assert result is not None
31
+ assert 'condition' in result
32
+ assert 'emergency_keywords' in result
33
+ assert 'treatment_keywords' in result
34
+
35
+ def test_handle_matching_failure_level1(self):
36
+ """Test loose keyword matching mechanism"""
37
+ test_queries = [
38
+ "urgent medical help",
39
+ "critical condition",
40
+ "severe symptoms"
41
+ ]
42
+
43
+ for query in test_queries:
44
+ result = self.processor._handle_matching_failure_level1(query)
45
+
46
+ assert result is not None
47
+ assert result['type'] == 'loose_keyword_match'
48
+ assert result['confidence'] == 0.5
49
+
50
+ def test_semantic_search_fallback(self):
51
+ """Verify semantic search fallback mechanism"""
52
+ test_queries = [
53
+ "how to manage chest pain",
54
+ "treatment for acute stroke",
55
+ "emergency cardiac care"
56
+ ]
57
+
58
+ for query in test_queries:
59
+ result = self.processor._semantic_search_fallback(query)
60
+
61
+ # Result can be None if no match found
62
+ if result is not None:
63
+ assert 'condition' in result
64
+ assert 'emergency_keywords' in result
65
+ assert 'treatment_keywords' in result
66
+
67
+ def test_validate_keywords(self):
68
+ """Test keyword validation functionality"""
69
+ valid_keywords = {
70
+ 'emergency_keywords': 'urgent|critical',
71
+ 'treatment_keywords': 'medication|therapy'
72
+ }
73
+
74
+ invalid_keywords = {
75
+ 'emergency_keywords': '',
76
+ 'treatment_keywords': ''
77
+ }
78
+
79
+ assert self.processor.validate_keywords(valid_keywords) is True
80
+ assert self.processor.validate_keywords(invalid_keywords) is False
81
+
82
+ def main():
83
+ """Run comprehensive test suite with detailed reporting"""
84
+ print("\n" + "="*60)
85
+ print("OnCall.ai: User Prompt Processor Test Suite")
86
+ print("="*60)
87
+
88
+ # Run pytest with verbose output
89
+ pytest.main([__file__, '-v', '--tb=short'])
90
+
91
+ if __name__ == "__main__":
92
+ main()
tests/test_userinput_userprompt_medical_condition_llm_retrieval.py ADDED
@@ -0,0 +1,479 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Comprehensive Test Suite for OnCall.ai Medical Query Processing Pipeline
4
+
5
+ This test validates the complete flow:
6
+ User Input → UserPrompt Processing → Medical Condition Extraction → LLM Analysis → Retrieval
7
+
8
+ Test Components:
9
+ - UserPromptProcessor (condition extraction, keyword mapping)
10
+ - MedicalConditions (predefined mappings, validation)
11
+ - LLM Client (Llama3-Med42-70B condition extraction)
12
+ - BasicRetrievalSystem (vector search, result processing)
13
+
14
+ Author: OnCall.ai Team
15
+ Date: 2025-07-30
16
+ """
17
+
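+ # Minimal wiring sketch (mirrors initialize_components() and run_single_test() below;
+ # the query strings are illustrative, the constructors and argument names are the ones
+ # used in this file):
+ #   llm = llm_Med42_70BClient()
+ #   retrieval = BasicRetrievalSystem()
+ #   processor = UserPromptProcessor(llm_client=llm, retrieval_system=retrieval)
+ #   condition = processor.extract_condition_keywords("how to treat acute MI?")
+ #   results = retrieval.search("MI STEMI treatment", top_k=5)
+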
18
+ import sys
19
+ import os
20
+ from pathlib import Path
21
+ import logging
22
+ import json
23
+ import traceback
24
+ from datetime import datetime
25
+ from typing import Dict, List, Any
26
+
27
+ # Add src directory to Python path
28
+ current_dir = Path(__file__).parent
29
+ project_root = current_dir.parent
30
+ src_dir = project_root / "src"
31
+ sys.path.insert(0, str(src_dir))
32
+
33
+ # Import our modules
34
+ try:
35
+ from user_prompt import UserPromptProcessor
36
+ from retrieval import BasicRetrievalSystem
37
+ from llm_clients import llm_Med42_70BClient
38
+ from medical_conditions import CONDITION_KEYWORD_MAPPING, validate_condition, get_condition_details
39
+ except ImportError as e:
40
+ print(f"❌ Import Error: {e}")
41
+ print(f"Current working directory: {os.getcwd()}")
42
+ print(f"Python path: {sys.path}")
43
+ sys.exit(1)
44
+
45
+ # Configure comprehensive logging
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
49
+ handlers=[
50
+ logging.StreamHandler(),
51
+ logging.FileHandler(project_root / 'tests' / 'pipeline_test.log')
52
+ ]
53
+ )
54
+ logger = logging.getLogger(__name__)
55
+
56
+ class MedicalQueryPipelineTest:
57
+ """Comprehensive test suite for the medical query processing pipeline"""
58
+
59
+ def __init__(self):
60
+ """Initialize test suite with all required components"""
61
+ self.start_time = datetime.now()
62
+ self.results = []
63
+ self.components_initialized = False
64
+
65
+ # Component references
66
+ self.llm_client = None
67
+ self.retrieval_system = None
68
+ self.user_prompt_processor = None
69
+
70
+ def initialize_components(self):
71
+ """Initialize all pipeline components with error handling"""
72
+ print("🔧 Initializing Pipeline Components...")
73
+ print("-" * 50)
74
+
75
+ try:
76
+ # Initialize LLM client
77
+ print("1. Initializing Llama3-Med42-70B Client...")
78
+ self.llm_client = llm_Med42_70BClient()
79
+ print(" ✅ LLM client initialized successfully")
80
+
81
+ # Initialize retrieval system
82
+ print("2. Initializing Retrieval System...")
83
+ self.retrieval_system = BasicRetrievalSystem()
84
+ print(" ✅ Retrieval system initialized successfully")
85
+
86
+ # Initialize user prompt processor
87
+ print("3. Initializing User Prompt Processor...")
88
+ self.user_prompt_processor = UserPromptProcessor(
89
+ llm_client=self.llm_client,
90
+ retrieval_system=self.retrieval_system
91
+ )
92
+ print(" ✅ User prompt processor initialized successfully")
93
+
94
+ self.components_initialized = True
95
+ print("\n🎉 All components initialized successfully!")
96
+
97
+ except Exception as e:
98
+ logger.error(f"Component initialization failed: {e}")
99
+ print(f"❌ Component initialization failed: {e}")
100
+ traceback.print_exc()
101
+ self.components_initialized = False
102
+
103
+ def get_test_queries(self) -> List[Dict[str, Any]]:
104
+ """Define comprehensive test queries with expected behavior"""
105
+ return [
106
+ {
107
+ "id": "test_001",
108
+ "query": "how to treat acute MI?",
109
+ "description": "Classic acute myocardial infarction query",
110
+ "expected_condition": "acute myocardial infarction",
111
+ "expected_mechanism": "predefined_mapping",
112
+ "category": "cardiac_emergency"
113
+ },
114
+ {
115
+ "id": "test_002",
116
+ "query": "patient with severe chest pain and shortness of breath",
117
+ "description": "Symptoms-based query requiring LLM analysis",
118
+ "expected_condition": ["acute myocardial infarction", "pulmonary embolism", "acute coronary syndrome"],
119
+ "expected_mechanism": "llm_extraction",
120
+ "category": "cardiac_pulmonary"
121
+ },
122
+ {
123
+ "id": "test_003",
124
+ "query": "sudden neurological symptoms suggesting stroke",
125
+ "description": "Neurological emergency query",
126
+ "expected_condition": "acute stroke",
127
+ "expected_mechanism": "predefined_mapping",
128
+ "category": "neurological_emergency"
129
+ },
130
+ {
131
+ "id": "test_004",
132
+ "query": "acute stroke management protocol",
133
+ "description": "Protocol-specific stroke query",
134
+ "expected_condition": "acute stroke",
135
+ "expected_mechanism": "predefined_mapping",
136
+ "category": "neurological_protocol"
137
+ },
138
+ {
139
+ "id": "test_005",
140
+ "query": "patient presenting with acute abdominal pain",
141
+ "description": "General symptom requiring LLM analysis",
142
+ "expected_condition": "unknown",
143
+ "expected_mechanism": "semantic_fallback",
144
+ "category": "general_symptom"
145
+ },
146
+ {
147
+ "id": "test_006",
148
+ "query": "pulmonary embolism treatment guidelines",
149
+ "description": "Specific condition with treatment focus",
150
+ "expected_condition": "pulmonary embolism",
151
+ "expected_mechanism": "predefined_mapping",
152
+ "category": "pulmonary_emergency"
153
+ }
154
+ ]
155
+
156
+ def run_single_test(self, test_case: Dict[str, Any]) -> Dict[str, Any]:
157
+ """Execute a single test case with comprehensive analysis"""
158
+ test_id = test_case["id"]
159
+ query = test_case["query"]
160
+
161
+ print(f"\n🔍 {test_id}: {test_case['description']}")
162
+ print(f"Query: '{query}'")
163
+ print("-" * 60)
164
+
165
+ result = {
166
+ "test_id": test_id,
167
+ "test_case": test_case,
168
+ "timestamp": datetime.now().isoformat(),
169
+ "success": False,
170
+ "error": None,
171
+ "execution_time": 0,
172
+ "steps": {}
173
+ }
174
+
175
+ start_time = datetime.now()
176
+
177
+ try:
178
+ # Step 1: Condition Extraction
179
+ print("Step 1: Extracting medical condition and keywords...")
180
+ condition_start = datetime.now()
181
+
182
+ condition_result = self.user_prompt_processor.extract_condition_keywords(query)
183
+ condition_time = (datetime.now() - condition_start).total_seconds()
184
+
185
+ result["steps"]["condition_extraction"] = {
186
+ "duration_seconds": condition_time,
187
+ "condition": condition_result.get('condition', ''),
188
+ "emergency_keywords": condition_result.get('emergency_keywords', ''),
189
+ "treatment_keywords": condition_result.get('treatment_keywords', ''),
190
+ "confidence": condition_result.get('confidence', 'unknown'),
191
+ "source": self._determine_extraction_source(condition_result)
192
+ }
193
+
194
+ print(f" Condition: {condition_result.get('condition', 'None')}")
195
+ print(f" Emergency keywords: {condition_result.get('emergency_keywords', 'None')}")
196
+ print(f" Treatment keywords: {condition_result.get('treatment_keywords', 'None')}")
197
+ print(f" Source: {result['steps']['condition_extraction']['source']}")
198
+ print(f" Duration: {condition_time:.3f}s")
199
+
200
+ # Step 2: User Confirmation (Simulated)
201
+ print("\nStep 2: User confirmation process...")
202
+ confirmation_result = self.user_prompt_processor.handle_user_confirmation(condition_result)
203
+
204
+ result["steps"]["user_confirmation"] = {
205
+ "confirmation_type": confirmation_result.get('type', 'unknown'),
206
+ "message_length": len(confirmation_result.get('message', '')),
207
+ "actionable": confirmation_result.get('type') == 'confirmation_needed'
208
+ }
209
+
210
+ print(f" Confirmation type: {confirmation_result.get('type', 'Unknown')}")
211
+
212
+ # Step 3: Retrieval Execution
213
+ if condition_result.get('condition'):
214
+ print("\nStep 3: Executing retrieval...")
215
+ retrieval_start = datetime.now()
216
+
217
+ # Construct search query
218
+ search_query = self._construct_search_query(condition_result)
219
+
220
+ # Perform retrieval
221
+ retrieval_results = self.retrieval_system.search(search_query, top_k=5)
222
+ retrieval_time = (datetime.now() - retrieval_start).total_seconds()
223
+
224
+ # Correctly count emergency and treatment results from processed_results
225
+ processed_results = retrieval_results.get('processed_results', [])
226
+ emergency_count = len([r for r in processed_results if r.get('type') == 'emergency'])
227
+ treatment_count = len([r for r in processed_results if r.get('type') == 'treatment'])
228
+
229
+ result["steps"]["retrieval"] = {
230
+ "duration_seconds": retrieval_time,
231
+ "search_query": search_query,
232
+ "total_results": retrieval_results.get('total_results', 0),
233
+ "emergency_results": emergency_count,
234
+ "treatment_results": treatment_count,
235
+ "processed_results": len(processed_results),
236
+ "duplicates_removed": retrieval_results.get('processing_info', {}).get('duplicates_removed', 0)
237
+ }
238
+
239
+ print(f" Search query: '{search_query}'")
240
+ print(f" Total results: {result['steps']['retrieval']['total_results']}")
241
+ print(f" Emergency results: {emergency_count}")
242
+ print(f" Treatment results: {treatment_count}")
243
+ print(f" Duration: {retrieval_time:.3f}s")
244
+
245
+ # Analyze top results
246
+ if 'processed_results' in retrieval_results and retrieval_results['processed_results']:
247
+ top_results = retrieval_results['processed_results'][:3]
248
+ result["steps"]["top_results_analysis"] = []
249
+
250
+ print(f"\n Top {len(top_results)} results:")
251
+ for i, res in enumerate(top_results, 1):
252
+ analysis = {
253
+ "rank": i,
254
+ "type": res.get('type', 'unknown'),
255
+ "distance": res.get('distance', 999),
256
+ "text_length": len(res.get('text', '')),
257
+ "has_matched_keywords": bool(res.get('matched', '')),
258
+ "has_treatment_keywords": bool(res.get('matched_treatment', ''))
259
+ }
260
+ result["steps"]["top_results_analysis"].append(analysis)
261
+
262
+ print(f" {i}. Type: {analysis['type']}, Distance: {analysis['distance']:.4f}")
263
+ print(f" Text preview: {res.get('text', '')[:100]}...")
264
+ if res.get('matched'):
265
+ print(f" Matched: {res.get('matched')}")
266
+ if res.get('matched_treatment'):
267
+ print(f" Treatment: {res.get('matched_treatment')}")
268
+
269
+ else:
270
+ print("\nStep 3: Skipping retrieval (no condition extracted)")
271
+ result["steps"]["retrieval"] = {
272
+ "skipped": True,
273
+ "reason": "no_condition_extracted"
274
+ }
275
+
276
+ # Calculate total execution time
277
+ total_time = (datetime.now() - start_time).total_seconds()
278
+ result["execution_time"] = total_time
279
+ result["success"] = True
280
+
281
+ print(f"\n✅ Test {test_id} completed successfully ({total_time:.3f}s)")
282
+
283
+ except Exception as e:
284
+ total_time = (datetime.now() - start_time).total_seconds()
285
+ result["execution_time"] = total_time
286
+ result["error"] = str(e)
287
+ result["traceback"] = traceback.format_exc()
288
+
289
+ logger.error(f"Test {test_id} failed: {e}")
290
+ print(f"❌ Test {test_id} failed: {e}")
291
+
292
+ return result
293
+
294
+ def _determine_extraction_source(self, condition_result: Dict) -> str:
295
+ """Determine how the condition was extracted"""
296
+ if condition_result.get('semantic_confidence') is not None:
297
+ return "semantic_search"
298
+ elif condition_result.get('generic_confidence') is not None:
299
+ return "generic_search"
300
+ elif condition_result.get('condition') in CONDITION_KEYWORD_MAPPING:
301
+ return "predefined_mapping"
302
+ else:
303
+ return "llm_extraction"
304
+
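+ # Illustrative mapping (hypothetical result dicts, consistent with the checks above):
+ #   {'condition': 'chest pain', 'semantic_confidence': 0.8}           -> 'semantic_search'
+ #   {'condition': 'generic medical query', 'generic_confidence': 0.6} -> 'generic_search'
+ #   {'condition': 'acute stroke'}      -> 'predefined_mapping' (if a CONDITION_KEYWORD_MAPPING key)
+ #   {'condition': 'aortic dissection'} -> 'llm_extraction' (otherwise)
+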
305
+ def _construct_search_query(self, condition_result: Dict) -> str:
306
+ """Construct search query from condition result"""
307
+ emergency_kws = condition_result.get('emergency_keywords', '')
308
+ treatment_kws = condition_result.get('treatment_keywords', '')
309
+
310
+ search_parts = []
311
+ if emergency_kws:
312
+ search_parts.append(emergency_kws)
313
+ if treatment_kws:
314
+ search_parts.append(treatment_kws)
315
+
316
+ if search_parts:
317
+ return ' '.join(search_parts)
318
+ else:
319
+ return condition_result.get('condition', 'medical emergency')
320
+
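+ # Example (illustrative keyword strings, using the pipe-separated format seen elsewhere
+ # in these tests): for
+ #   {'condition': 'acute myocardial infarction',
+ #    'emergency_keywords': 'MI|chest pain', 'treatment_keywords': 'aspirin|PCI'}
+ # the constructed query is 'MI|chest pain aspirin|PCI'; with both keyword fields empty
+ # it falls back to the condition string (or 'medical emergency' if that is empty too).
+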
321
+ def run_all_tests(self):
322
+ """Execute all test cases and generate comprehensive report"""
323
+ if not self.components_initialized:
324
+ print("❌ Cannot run tests: components not initialized")
325
+ return
326
+
327
+ test_cases = self.get_test_queries()
328
+
329
+ print(f"\n🚀 Starting Comprehensive Pipeline Test")
330
+ print(f"Total test cases: {len(test_cases)}")
331
+ print(f"Test started at: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
332
+ print("=" * 80)
333
+
334
+ # Execute all tests
335
+ for test_case in test_cases:
336
+ result = self.run_single_test(test_case)
337
+ self.results.append(result)
338
+
339
+ # Generate comprehensive report
340
+ self.generate_test_report()
341
+ self.save_test_results()
342
+
343
+ def generate_test_report(self):
344
+ """Generate detailed test report with statistics and analysis"""
345
+ end_time = datetime.now()
346
+ total_duration = (end_time - self.start_time).total_seconds()
347
+
348
+ successful_tests = [r for r in self.results if r['success']]
349
+ failed_tests = [r for r in self.results if not r['success']]
350
+
351
+ print("\n" + "=" * 80)
352
+ print("📊 COMPREHENSIVE TEST REPORT")
353
+ print("=" * 80)
354
+
355
+ # Summary Statistics
356
+ print(f"🕐 Execution Summary:")
357
+ print(f" Start time: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
358
+ print(f" End time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
359
+ print(f" Total duration: {total_duration:.3f}s")
360
+ print(f" Average per test: {total_duration/len(self.results):.3f}s")
361
+
362
+ print(f"\n📈 Test Results:")
363
+ print(f" Total tests: {len(self.results)}")
364
+ print(f" Successful: {len(successful_tests)} ✅")
365
+ print(f" Failed: {len(failed_tests)} ❌")
366
+ print(f" Success rate: {len(successful_tests)/len(self.results)*100:.1f}%")
367
+
368
+ # Detailed Analysis
369
+ if successful_tests:
370
+ print(f"\n✅ Successful Tests Analysis:")
371
+
372
+ # Analyze extraction sources
373
+ source_counts = {}
374
+ total_retrieval_time = 0
375
+ total_condition_time = 0
376
+ retrieval_count = 0
377
+
378
+ for result in successful_tests:
379
+ if 'condition_extraction' in result['steps']:
380
+ source = result['steps']['condition_extraction']['source']
381
+ source_counts[source] = source_counts.get(source, 0) + 1
382
+ total_condition_time += result['steps']['condition_extraction']['duration_seconds']
383
+
384
+ if 'retrieval' in result['steps'] and not result['steps']['retrieval'].get('skipped'):
385
+ total_retrieval_time += result['steps']['retrieval']['duration_seconds']
386
+ retrieval_count += 1
387
+
388
+ print(f" Condition extraction sources:")
389
+ for source, count in source_counts.items():
390
+ print(f" - {source}: {count} tests")
391
+
392
+ print(f" Performance metrics:")
393
+ print(f" - Avg condition extraction: {total_condition_time/len(successful_tests):.3f}s")
394
+ if retrieval_count > 0:
395
+ print(f" - Avg retrieval time: {total_retrieval_time/retrieval_count:.3f}s")
396
+
397
+ # Individual test details
398
+ for result in successful_tests:
399
+ test_case = result['test_case']
400
+ print(f"\n 📋 {result['test_id']}: {test_case['description']}")
401
+ print(f" Query: '{test_case['query']}'")
402
+
403
+ if 'condition_extraction' in result['steps']:
404
+ ce = result['steps']['condition_extraction']
405
+ print(f" Condition: {ce['condition']}")
406
+ print(f" Source: {ce['source']}")
407
+
408
+ if 'retrieval' in result['steps'] and not result['steps']['retrieval'].get('skipped'):
409
+ ret = result['steps']['retrieval']
410
+ print(f" Results: {ret['total_results']} total ({ret['emergency_results']} emergency, {ret['treatment_results']} treatment)")
411
+
412
+ print(f" Duration: {result['execution_time']:.3f}s")
413
+
414
+ # Failed Tests Analysis
415
+ if failed_tests:
416
+ print(f"\n❌ Failed Tests Analysis:")
417
+ for result in failed_tests:
418
+ test_case = result['test_case']
419
+ print(f" {result['test_id']}: {test_case['description']}")
420
+ print(f" Error: {result['error']}")
421
+ print(f" Duration: {result['execution_time']:.3f}s")
422
+
423
+ print("\n" + "=" * 80)
424
+
425
+ def save_test_results(self):
426
+ """Save detailed test results to JSON file"""
427
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
428
+ filename = project_root / 'tests' / f'pipeline_test_results_{timestamp}.json'
429
+
430
+ try:
431
+ comprehensive_results = {
432
+ "test_metadata": {
433
+ "timestamp": datetime.now().isoformat(),
434
+ "start_time": self.start_time.isoformat(),
435
+ "total_duration_seconds": (datetime.now() - self.start_time).total_seconds(),
436
+ "total_tests": len(self.results),
437
+ "successful_tests": len([r for r in self.results if r['success']]),
438
+ "failed_tests": len([r for r in self.results if not r['success']])
439
+ },
440
+ "test_results": self.results,
441
+ "component_versions": {
442
+ "user_prompt_processor": "1.0.0",
443
+ "retrieval_system": "1.0.0",
444
+ "llm_client": "1.0.0"
445
+ }
446
+ }
447
+
448
+ with open(filename, 'w', encoding='utf-8') as f:
449
+ json.dump(comprehensive_results, f, indent=2, ensure_ascii=False)
450
+
451
+ print(f"📁 Comprehensive test results saved to: {filename}")
452
+
453
+ except Exception as e:
454
+ logger.error(f"Failed to save test results: {e}")
455
+ print(f"⚠️ Failed to save test results: {e}")
456
+
457
+ def main():
458
+ """Main execution function"""
459
+ print("🏥 OnCall.ai Medical Query Processing Pipeline Test")
460
+ print("=" * 60)
461
+
462
+ # Initialize test suite
463
+ test_suite = MedicalQueryPipelineTest()
464
+
465
+ # Initialize components
466
+ test_suite.initialize_components()
467
+
468
+ if not test_suite.components_initialized:
469
+ print("❌ Test suite initialization failed. Exiting.")
470
+ return 1
471
+
472
+ # Run all tests
473
+ test_suite.run_all_tests()
474
+
475
+ return 0
476
+
477
+ if __name__ == "__main__":
478
+ exit_code = main()
479
+ sys.exit(exit_code)