Jeff Ma committed
Commit 093cf0a · 2 Parent(s): 940c6d1 16ee1e5

Merge pull request #9 from YanBoChen0928/newbranchYB-rollback
app.py CHANGED
@@ -136,11 +136,16 @@ class OnCallAIInterface:
136
  processing_steps.append(f" ⏱️ Processing Time: {step1_time:.3f}s")
137
 
138
  # Handle non-medical queries
139
- if condition_result.get('type') == 'invalid_query':
140
  non_medical_msg = condition_result.get('message', 'This appears to be a non-medical query.')
141
  processing_steps.append(" 🚫 Query identified as non-medical")
142
  return non_medical_msg, '\n'.join(processing_steps), "{}"
143
 
 
 
 
 
 
144
  # STEP 2: User Confirmation (Auto-simulated)
145
  processing_steps.append("\n🤝 Step 2: User confirmation (auto-confirmed for demo)")
146
  confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
 
136
  processing_steps.append(f" ⏱️ Processing Time: {step1_time:.3f}s")
137
 
138
  # Handle non-medical queries
139
+ if condition_result.get('query_status') in ['invalid_query', 'non_medical']:
140
  non_medical_msg = condition_result.get('message', 'This appears to be a non-medical query.')
141
  processing_steps.append(" 🚫 Query identified as non-medical")
142
  return non_medical_msg, '\n'.join(processing_steps), "{}"
143
 
144
+ # Handle medical query with no specific condition
145
+ if condition_result.get('query_status') == 'medical_no_condition':
146
+ processing_steps.append(" ℹ️ Medical query confirmed, no specific condition extracted")
147
+ # Continue with standard processing
148
+
149
  # STEP 2: User Confirmation (Auto-simulated)
150
  processing_steps.append("\n🤝 Step 2: User confirmation (auto-confirmed for demo)")
151
  confirmation = self.user_prompt_processor.handle_user_confirmation(condition_result)
dataset/keywords/special_terms_emergency.json CHANGED
@@ -1,26 +1,31 @@
1
  {
2
- "cardiac": {
3
- "mi": ["mi", "m.i.", "myocardial infarction", "MI"],
4
- "acs": ["acs", "ACS", "acute coronary syndrome"]
5
- },
6
- "respiratory": {
7
- "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
8
- "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
9
- },
10
- "neurological": {
11
- "loc": ["loc", "LOC", "loss of consciousness"],
12
- "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
13
- },
14
- "shock": {
15
- "shock": ["shock", "circulatory failure"],
16
- "septic_shock": ["septic shock", "sepsis induced shock"]
17
- },
18
- "bleeding": {
19
- "gi_bleed": ["gi bleed", "gi bleeding", "gastrointestinal hemorrhage", "GI hemorrhage"],
20
- "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
21
- },
22
- "vital_signs": {
23
- "hypotension": ["hypotension", "low bp", "low blood pressure"],
24
- "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
25
- }
26
- }
 
 
 
 
 
 
1
  {
2
+ "cardiac": {
3
+ "mi": ["mi", "m.i.", "myocardial infarction", "MI", "STEMI", "NSTEMI"],
4
+ "acs": ["acs", "ACS", "acute coronary syndrome"]
5
+ },
6
+ "respiratory": {
7
+ "ards": ["ards", "ARDS", "acute respiratory distress syndrome"],
8
+ "respiratory_failure": ["respiratory failure", "resp failure", "RF"]
9
+ },
10
+ "neurological": {
11
+ "loc": ["loc", "LOC", "loss of consciousness"],
12
+ "cva": ["cva", "CVA", "stroke", "cerebrovascular accident"]
13
+ },
14
+ "shock": {
15
+ "shock": ["shock", "circulatory failure"],
16
+ "septic_shock": ["septic shock", "sepsis induced shock"]
17
+ },
18
+ "bleeding": {
19
+ "gi_bleed": [
20
+ "gi bleed",
21
+ "gi bleeding",
22
+ "gastrointestinal hemorrhage",
23
+ "GI hemorrhage"
24
+ ],
25
+ "hemorrhage": ["hemorrhage", "bleeding", "blood loss"]
26
+ },
27
+ "vital_signs": {
28
+ "hypotension": ["hypotension", "low bp", "low blood pressure"],
29
+ "tachycardia": ["tachycardia", "elevated heart rate", "fast heart rate"]
30
+ }
31
+ }
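The table above maps each category to a canonical key and its surface-form synonyms (STEMI and NSTEMI are newly added under `mi`). As a hedged illustration only — the loader and matcher below are not part of this commit, and their names are assumptions — such a dictionary might be used for keyword spotting like this:

```python
# Illustrative sketch: load the synonym table and spot emergency terms in a query.
import json
from typing import Dict, List


def load_special_terms(path: str = "dataset/keywords/special_terms_emergency.json") -> Dict[str, Dict[str, List[str]]]:
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def match_special_terms(query: str, table: Dict[str, Dict[str, List[str]]]) -> Dict[str, List[str]]:
    """Return {category: [canonical keys]} whose synonyms appear in the query text."""
    lowered = query.lower()
    hits: Dict[str, List[str]] = {}
    for category, terms in table.items():
        for canonical, synonyms in terms.items():
            if any(syn.lower() in lowered for syn in synonyms):
                hits.setdefault(category, []).append(canonical)
    return hits
```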
evaluation/evaluation_instruction.md ADDED
@@ -0,0 +1,282 @@
# Model use
LLM models (for comparison with our own version):
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
https://huggingface.co/m42-health/Llama3-Med42-70B

Evaluation model:
https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct

```python
"""
See user_query.txt
"""
```


### Evaluation Execution Flow
```python
import time
from typing import Any, Dict, List


def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
    """Run the complete six-metric evaluation."""

    results = {
        "model": model_name,
        "metrics": {},
        "detailed_results": []
    }

    total_latencies = []
    extraction_successes = []
    relevance_scores = []
    coverage_scores = []
    actionability_scores = []
    evidence_scores = []

    for query in test_cases:
        # Run the model and measure all metrics
        start_time = time.time()

        # 1. Total processing latency
        latency_result = measure_total_latency(query)
        total_latencies.append(latency_result['total_latency'])

        # 2. Condition extraction success rate
        extraction_result = evaluate_condition_extraction([query])
        extraction_successes.append(extraction_result['success_rate'])

        # 3 & 4. Retrieval relevance and coverage (requires actual retrieval results)
        retrieval_results = get_retrieval_results(query)
        relevance_result = evaluate_retrieval_relevance(retrieval_results)
        relevance_scores.append(relevance_result['average_relevance'])

        generated_advice = get_generated_advice(query, retrieval_results)
        coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
        coverage_scores.append(coverage_result['coverage'])

        # 5 & 6. LLM-based evaluation (requires the full response)
        response_data = {
            'query': query,
            'advice': generated_advice,
            'retrieval_results': retrieval_results
        }

        actionability_result = evaluate_clinical_actionability([response_data])
        actionability_scores.append(actionability_result[0]['overall_score'])

        evidence_result = evaluate_clinical_evidence([response_data])
        evidence_scores.append(evidence_result[0]['overall_score'])

        # Record detailed per-query results
        results["detailed_results"].append({
            "query": query,
            "latency": latency_result,
            "extraction": extraction_result,
            "relevance": relevance_result,
            "coverage": coverage_result,
            "actionability": actionability_result[0],
            "evidence": evidence_result[0]
        })

    # Compute averaged metrics
    results["metrics"] = {
        "average_latency": sum(total_latencies) / len(total_latencies),
        "extraction_success_rate": sum(extraction_successes) / len(extraction_successes),
        "average_relevance": sum(relevance_scores) / len(relevance_scores),
        "average_coverage": sum(coverage_scores) / len(coverage_scores),
        "average_actionability": sum(actionability_scores) / len(actionability_scores),
        "average_evidence_score": sum(evidence_scores) / len(evidence_scores)
    }

    return results
```
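The helpers called above (`measure_total_latency`, `get_retrieval_results`, `evaluate_clinical_actionability`, and so on) are assumed to exist elsewhere in the evaluation package. As a rough sketch of the intended shape of the first one — with the pipeline entry point passed in explicitly, since this guide does not pin down that API — it might look like:

```python
# Hedged sketch only: the pipeline callable is an assumption, not this commit's API.
import time
from typing import Any, Callable, Dict


def measure_total_latency(query: str, run_pipeline: Callable[[str], Any]) -> Dict[str, Any]:
    """Time one end-to-end pipeline run for a single query."""
    start = time.time()
    output = run_pipeline(query)  # e.g. a wrapper around the Gradio app's processing function
    return {"total_latency": time.time() - start, "output": output}
```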

---

## 📈 Evaluation Results Analysis Framework

### Statistical Analysis
```python
def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
    """Compare the evaluation results of the three models."""

    models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
    metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']

    comparison = {}

    for metric in metrics:
        comparison[metric] = {
            models[0]: results_A['metrics'][f'average_{metric}'],
            models[1]: results_B['metrics'][f'average_{metric}'],
            models[2]: results_C['metrics'][f'average_{metric}']
        }

        # Compute relative improvement over the baseline
        baseline = comparison[metric][models[0]]
        rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100

        comparison[metric]['rag_improvement_percent'] = rag_improvement

    return comparison
```

### Report Generation
```python
def generate_evaluation_report(comparison_results: Dict) -> str:
    """Generate the evaluation report."""

    report = f"""
# OnCall.ai System Evaluation Report

## Evaluation Summary

| Metric | Med42-70B | RAG-Enhanced | OpenBioLLM | RAG Improvement % |
|------|-----------|-----------|------------|----------|
| Latency | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
| Condition extraction success rate | {comparison_results['extraction_success_rate']['Med42-70B_direct']:.1%} | {comparison_results['extraction_success_rate']['RAG_enhanced']:.1%} | {comparison_results['extraction_success_rate']['OpenBioLLM-70B']:.1%} | {comparison_results['extraction_success_rate']['rag_improvement_percent']:+.1f}% |
| Retrieval relevance | - | {comparison_results['relevance']['RAG_enhanced']:.3f} | - | - |
| Retrieval coverage | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
| Clinical actionability | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
| Clinical evidence score | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |

    """

    return report
```

---

## 🔧 Experiment Execution Steps

### 1. Environment Preparation
```bash
# Set the HuggingFace token (used for Inference Providers)
export HF_TOKEN=your_huggingface_token

# Enable evaluation mode
export ONCALL_EVAL_MODE=true
```

### 2. Experiment Runner Skeleton
```python
# evaluation/run_evaluation.py
def main():
    """Main evaluation entry point."""

    # Load test cases
    test_cases = MEDICAL_TEST_CASES

    # Experiment A: YanBo system evaluation
    print("🔬 Starting Experiment A: YanBo system evaluation")
    results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
    results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
    results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)

    # Analysis and reporting
    comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
    report_A = generate_evaluation_report(comparison_A)

    # Save results
    save_results("evaluation/results/yanbo_evaluation.json", {
        "comparison": comparison_A,
        "detailed_results": [results_med42_direct, results_general_rag, results_openbio]
    })

    print("✅ Experiment A finished; results saved")

    # Experiment B: Jeff system evaluation
    print("🔬 Starting Experiment B: Jeff system evaluation")
    results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
    results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
    results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)

    # Analysis and reporting
    comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
    report_B = generate_evaluation_report(comparison_B)

    # Save results
    save_results("evaluation/results/jeff_evaluation.json", {
        "comparison": comparison_B,
        "detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
    })

    print("✅ Experiment B finished; results saved")

if __name__ == "__main__":
    main()
```
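`save_results` is used above but not defined in this guide; assuming plain JSON files on disk are sufficient, a minimal sketch could be:

```python
# Illustrative sketch: persist evaluation output as pretty-printed JSON.
import json
import os
from typing import Any, Dict


def save_results(path: str, payload: Dict[str, Any]) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
```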

### 3. Expected Evaluation Time
```
Total evaluation time estimate:
├── Processing time per query: ~30 s (including LLM evaluation)
├── Number of test cases: 7
├── Number of models: 3
└── Total: ~10-15 minutes per experiment
```

---

## 📊 Evaluation Success Criteria

### System Performance Targets
```
✅ Pass conditions (a mechanical check is sketched below):
1. Total processing latency ≤ 30 s
2. Condition extraction success rate ≥ 80%
3. Retrieval relevance ≥ 0.2
4. Retrieval coverage ≥ 60%
5. Clinical actionability ≥ 7.0/10
6. Clinical evidence score ≥ 7.5/10

🎯 RAG system success criteria:
- The RAG-enhanced version beats the Med42-70B baseline on 4-6 of the metrics
- Overall improvement ≥ 10%
```
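The pass conditions above can be checked mechanically against the `results['metrics']` dictionary produced by `run_complete_evaluation`; the sketch below hard-codes the thresholds from the list and is illustrative only.

```python
# Illustrative sketch: compare averaged metrics against the targets listed above.
from typing import Dict

TARGETS = {
    "average_latency": (30.0, "max"),           # seconds
    "extraction_success_rate": (0.80, "min"),
    "average_relevance": (0.2, "min"),
    "average_coverage": (0.60, "min"),
    "average_actionability": (7.0, "min"),      # out of 10
    "average_evidence_score": (7.5, "min"),     # out of 10
}


def check_success_criteria(metrics: Dict[str, float]) -> Dict[str, bool]:
    """Return a pass/fail flag for each metric."""
    verdict = {}
    for name, (threshold, direction) in TARGETS.items():
        value = metrics.get(name)
        if value is None:
            verdict[name] = False
        elif direction == "max":
            verdict[name] = value <= threshold
        else:
            verdict[name] = value >= threshold
    return verdict
```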

### Key Comparison Dimensions
```
Main analysis dimensions:
├── Impact of RAG on processing time (may add latency)
├── Impact of RAG on answer quality (actionability and evidence quality)
├── Effect of different RAG strategies (general vs customized)
└── Competitiveness against other medical models
```

---

## 🛠️ Implementation Recommendations

### Phased Implementation
```
Phase 1: Basic metrics (items 1-4)
├── Reuse the timing measurements already in app.py
├── Extend the condition-extraction evaluation in user_prompt.py
├── Enhance the relevance analysis in retrieval.py
└── Implement the coverage calculation for generation.py

Phase 2: LLM-based evaluation (items 5-6)
├── Set up HuggingFace Inference Providers
├── Implement the Llama3-70B evaluation client (see the sketch below)
├── Test the stability of the evaluation prompts
└── Build the logic for parsing evaluation results

Phase 3: Full experiment execution
├── Prepare the standard test cases
├── Run the YanBo system evaluation (Experiment A)
├── Run the Jeff system evaluation (Experiment B)
└── Generate the comparative analysis report
```
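Phase 2 calls for a Llama3-70B evaluation client on HuggingFace Inference Providers. The wrapper below mirrors the `HuggingFaceInferenceClient` usage shown in the companion customization guide; the class itself and the `sambanova` provider choice are assumptions, not a confirmed implementation.

```python
# Hedged sketch of the LLM-as-judge client assumed by this guide.
import os

from huggingface_hub import InferenceClient


class HuggingFaceInferenceClient:
    """Thin wrapper around huggingface_hub.InferenceClient for evaluation calls."""

    def __init__(self, model: str = "meta-llama/Meta-Llama-3-70B-Instruct", provider: str = "sambanova"):
        self.model = model
        self.client = InferenceClient(provider=provider, api_key=os.environ["HF_TOKEN"])

    def chat_completion(self, messages, temperature: float = 0.1, max_tokens: int = 1000):
        # Callers read .choices[0].message.content from the returned object.
        return self.client.chat_completion(
            messages=messages,
            model=self.model,
            temperature=temperature,
            max_tokens=max_tokens,
        )
```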

### Implementation Notes
```
⚠️ Important reminders:
1. Keep all evaluation code independent of the existing system so normal operation is not affected
2. LLM-based evaluation can be unstable; run it several times and average the scores (see the sketch below)
3. Keep API costs under control, especially Llama3-70B calls
4. Save detailed intermediate results to make debugging and analysis easier
5. Test cases should cover different complexity levels and medical domains
```
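Reminder 2 recommends averaging repeated LLM judgments; a minimal sketch, assuming an evaluator with the same list-in/list-out shape as `evaluate_clinical_actionability`:

```python
# Illustrative sketch: average an unstable LLM-judge score over several runs.
from statistics import mean
from typing import Callable, Dict, List


def averaged_llm_score(evaluate_fn: Callable[[List[Dict]], List[Dict]],
                       response_data: Dict,
                       runs: int = 3) -> float:
    """Call the judge `runs` times and return the mean overall score."""
    scores = [evaluate_fn([response_data])[0].get("overall_score", 0.0) for _ in range(runs)]
    return mean(scores)
```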

---

**Evaluation guide complete. Please implement the evaluation experiments according to this guide.**
evaluation/evaluation_instruction_customization.md ADDED
@@ -0,0 +1,721 @@
_Chunk
- Semantic_Similarity_Score = cosine_similarity(generated_text, chunk_text)
- Semantic similarity is computed with the BGE-Large-Medical model

# Model
LLM models (for comparison with our own version):
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
https://huggingface.co/m42-health/Llama3-Med42-70B

Evaluation model:
https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct

```python
"""
See user_query.txt
"""
```

# **ASCII flow chart:**
```
Coverage calculation flow for the customized system:
┌─────────────┐    ┌─────────────┐    ┌─────────────┐
│ Two-stage   │───▶│ Generated   │───▶│ Compute     │
│ retrieval,  │    │ medical     │    │ usage       │
│ 8-15 chunks │    │ advice      │    │ ratio       │
└─────────────┘    └─────────────┘    └─────────────┘
       │                 │                  │
       ▼                 ▼                  ▼
┌─────────────┐    ┌─────────────┐    ┌─────────────┐
│ Hospital    │    │ Medical-term│    │ Coverage =  │
│ PDF content │    │ matching    │    │ 10/15 = 67% │
└─────────────┘    └─────────────┘    └─────────────┘
```

**Implementation framework:**
```python
# Chunk-level analysis for the customized system
from typing import Dict, List

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def evaluate_customization_coverage(generated_advice: str, retrieved_chunks: List[Dict]) -> Dict[str, float]:
    """Evaluate retrieval coverage for the customized system."""

    if not retrieved_chunks:
        return {"coverage": 0.0, "used_chunks": 0, "total_chunks": 0}

    used_chunks = 0
    chunk_usage_details = []

    # Compute semantic similarity with the BGE-Large-Medical embedding model
    embedding_model = SentenceTransformer("BAAI/bge-large-zh-v1.5")  # model used by Jeff's system

    for i, chunk in enumerate(retrieved_chunks):
        chunk_text = chunk.get('chunk_text', '')
        document_name = chunk.get('document', '')
        similarity_score = chunk.get('score', 0.0)

        # Method 1: medical-term matching (terms specific to the hospital documents)
        medical_terms = extract_hospital_medical_terms(chunk_text)
        term_matches = count_medical_term_matches(generated_advice, medical_terms)
        term_match_score = term_matches / len(medical_terms) if medical_terms else 0

        # Method 2: semantic similarity (BGE-Large-Medical)
        chunk_embedding = embedding_model.encode([chunk_text])[0]
        advice_embedding = embedding_model.encode([generated_advice])[0]
        semantic_score = cosine_similarity([chunk_embedding], [advice_embedding])[0][0]

        # Combined score (also taking the original retrieval score into account)
        usage_score = max(term_match_score, semantic_score, similarity_score)

        # Threshold check (usage score > 0.25 counts as "used", tuned for the medical domain)
        is_used = usage_score > 0.25
        if is_used:
            used_chunks += 1

        chunk_usage_details.append({
            "chunk_id": i,
            "document": document_name,
            "original_score": similarity_score,
            "term_match_score": term_match_score,
            "semantic_score": semantic_score,
            "usage_score": usage_score,
            "is_used": is_used
        })

    coverage = used_chunks / len(retrieved_chunks)

    return {
        "coverage": coverage,
        "used_chunks": used_chunks,
        "total_chunks": len(retrieved_chunks),
        "chunk_details": chunk_usage_details,
        "average_original_score": sum(chunk['original_score'] for chunk in chunk_usage_details) / len(chunk_usage_details)
    }


def extract_hospital_medical_terms(text: str) -> List[str]:
    """Extract medical terms specific to the hospital documents."""
    # Terminology derived from the hospital's clinical guidelines
    hospital_medical_terms = []
    # Implementation detail: combine hospital-specific terms with standard medical vocabulary
    return hospital_medical_terms
```

**Target threshold:** ≥ 55% (accounting for the specialization and complexity of hospital documents)

---

### 5. Clinical Actionability

**Definition:** Clinical usefulness of advice generated from the hospital's customized documents

**Where it is measured:** Standalone evaluation module, scored by an LLM

**Evaluator:** `meta-llama/Llama-3.3-70B-Instruct`

**Evaluation prompt:**
```python
CUSTOMIZATION_ACTIONABILITY_PROMPT = """
You are a senior attending physician and clinical department head who has worked at this hospital for 15 years. You know the hospital's clinical guidelines, care workflows, and equipment and resource situation very well.

A junior physician has asked a clinical question, and the system produced advice based on the hospital's internal documents. Please assess the clinical actionability of this advice within this hospital's environment.

[Original clinical question]
{original_query}

[Advice based on hospital documents]
{medical_advice}

[Cited hospital document excerpts]
{hospital_document_chunks}

[Assessment criteria]
Rate the clinical actionability in the hospital environment on the following four dimensions (1-10 points each):

1. **Hospital Resource Compatibility**
   - Are the equipment, drugs, and tests required by the advice available at this hospital?
   - Does it match the hospital's actual care capabilities and resource allocation?
   - Score: 1-10

2. **Hospital Process Consistency**
   - Does the advice follow the hospital's standard care workflows?
   - Is it consistent with the hospital's clinical pathways and guidelines?
   - Score: 1-10

3. **Implementation Feasibility**
   - Can the advice be implemented immediately under the hospital's current conditions?
   - Are the required interdepartmental coordination and workflows realistic?
   - Score: 1-10

4. **Hospital-Specific Adaptation**
   - Does the advice reflect the hospital's specialty strengths and characteristics?
   - Does it account for the characteristics of the hospital's patient population?
   - Score: 1-10

[Response format]
Answer strictly in the following JSON format:

```json
{{
  "resource_compatibility_score": <integer 1-10>,
  "process_consistency_score": <integer 1-10>,
  "implementation_feasibility_score": <integer 1-10>,
  "hospital_adaptation_score": <integer 1-10>,
  "overall_actionability_score": <average of the four scores, one decimal place>,
  "detailed_feedback": "<explain the reasoning for each score, especially how well the advice matches the hospital's actual situation>"
}}
```

Remember: as a senior physician of this hospital, base your assessment on the hospital's actual conditions and resource capacity.
"""
```

**Scoring formula:**
```
Hospital_Actionability = (Resource_Compatibility + Process_Consistency + Implementation_Feasibility + Hospital_Adaptation) / 4

Hospital customization scoring scale:
- 9-10: Fully suited to this hospital; can be implemented immediately
- 7-8: Highly suited; needs minor adjustments
- 5-6: Broadly suited; needs some adaptation
- 3-4: Partially suited; needs significant modification
- 1-2: Not suited to this hospital's environment
```

**Implementation framework:**
```python
import json
from typing import Dict, List


def evaluate_customization_actionability(responses: List[Dict]) -> List[Dict]:
    """Evaluate the clinical actionability of the hospital-customized system."""
    evaluator_client = HuggingFaceInferenceClient(
        model="meta-llama/Llama-3.3-70B-Instruct",
        provider="sambanova"
    )

    actionability_results = []

    for response in responses:
        # Format the hospital document excerpts
        hospital_chunks = format_hospital_document_chunks(response.get('retrieval_results', []))

        prompt = CUSTOMIZATION_ACTIONABILITY_PROMPT.format(
            original_query=response['query'],
            medical_advice=response['advice'],
            hospital_document_chunks=hospital_chunks
        )

        # Call the Llama3-70B evaluator
        evaluation = evaluator_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=1000
        )

        # Parse the JSON response
        try:
            scores = json.loads(evaluation.choices[0].message.content)
            actionability_results.append({
                "model": response['model'],
                "actionability_scores": scores,
                "overall_score": scores['overall_actionability_score']
            })
        except json.JSONDecodeError:
            actionability_results.append({
                "model": response['model'],
                "error": "Failed to parse evaluation",
                "overall_score": 0.0
            })

    return actionability_results


def format_hospital_document_chunks(retrieval_results: List[Dict]) -> str:
    """Format hospital document excerpts for evaluation."""
    if not retrieval_results:
        return "No relevant hospital documents found"

    chunks_text = ""
    for i, result in enumerate(retrieval_results[:5], 1):
        doc_name = result.get('document', 'Unknown document')
        chunk_content = result.get('chunk_text', '')
        similarity = result.get('score', 0.0)

        chunks_text += f"""
[Hospital document {i}]
Source document: {doc_name}
Relevance: {similarity:.3f}
Content: {chunk_content}

"""

    return chunks_text.strip()
```
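The judge prompt above asks for the JSON answer inside a fenced code block, while the parsing code applies `json.loads` to the raw message content, so in practice the reply usually needs trimming first. A hedged helper for that step (not part of the committed code):

```python
# Illustrative sketch: pull the first JSON object out of an LLM judge reply.
import json
import re
from typing import Any, Dict, Optional


def extract_json_block(reply: str) -> Optional[Dict[str, Any]]:
    """Strip code fences and surrounding prose, then parse the first {...} object."""
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", reply, re.DOTALL)
    candidate = fenced.group(1) if fenced else None
    if candidate is None:
        braced = re.search(r"\{.*\}", reply, re.DOTALL)
        candidate = braced.group(0) if braced else None
    if candidate is None:
        return None
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return None
```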

**Target threshold:** ≥ 7.5

---

### 6. Clinical Evidence Score

**Definition:** Reliability and scientific soundness of the evidence behind advice based on hospital documents

**Where it is measured:** Standalone evaluation module, scored by an LLM

**Evaluator:** `meta-llama/Llama-3.3-70B-Instruct`

**Evaluation prompt:**
```python
CUSTOMIZATION_EVIDENCE_PROMPT = """
You are an evidence-based-medicine specialist in the hospital's medical affairs office and a member of the clinical guideline review committee. You are responsible for reviewing and updating the hospital's clinical guidelines and care standards, and you know the evidence quality of the hospital's internal documents in depth.

You are asked to assess the quality of the evidence base of a medical recommendation generated from the hospital's internal documents.

[Original clinical question]
{original_query}

[Advice based on hospital documents]
{medical_advice}

[Cited hospital document content]
{hospital_document_sources}

[Assessment criteria]
Rate the quality of the hospital-document evidence on the following four dimensions (1-10 points each):

1. **Hospital Document Authority**
   - Are the cited hospital documents authoritative clinical guidelines?
   - Are the document versions current and valid?
   - Score: 1-10

2. **Evidence-Recommendation Consistency**
   - Is the advice fully consistent with the content of the hospital documents?
   - Are there statements that contradict the hospital guidelines?
   - Score: 1-10

3. **Hospital Standard Compliance**
   - Does the advice meet the hospital's care standards and quality requirements?
   - Does it reflect the hospital's care norms and characteristics?
   - Score: 1-10

4. **Document Citation Accuracy**
   - Are the hospital documents cited and interpreted accurately?
   - Are misreadings or distortions of the document content avoided?
   - Score: 1-10

[Response format]
Answer strictly in the following JSON format:

```json
{{
  "document_authority_score": <integer 1-10>,
  "consistency_score": <integer 1-10>,
  "hospital_standard_score": <integer 1-10>,
  "citation_accuracy_score": <integer 1-10>,
  "overall_evidence_score": <average of the four scores, one decimal place>,
  "detailed_feedback": "<explain the reasoning for each score, especially whether the hospital documents are used accurately and appropriately>"
}}
```

Remember: as the hospital's evidence-based-medicine specialist, your assessment should ensure the advice fully complies with the hospital's document standards and care norms.
"""
```

**Scoring formula:**
```
Hospital_Evidence_Score = (Document_Authority + Consistency + Hospital_Standard + Citation_Accuracy) / 4

Hospital document evidence levels:
- Level A: Authoritative hospital guidelines and clinical pathways
- Level B: Departmental care standards and SOPs
- Level C: Hospital training materials and summaries of clinical experience
- Level D: Locally adapted versions of external guidelines
```

**Implementation framework:**
```python
def evaluate_customization_evidence(responses: List[Dict]) -> List[Dict]:
    """Evaluate the evidence quality of the hospital-customized system."""
    evaluator_client = HuggingFaceInferenceClient(
        model="meta-llama/Llama-3.3-70B-Instruct",
        provider="sambanova"
    )

    evidence_results = []

    for response in responses:
        # Format the hospital document sources
        hospital_sources = format_hospital_document_sources(response.get('retrieval_results', []))

        prompt = CUSTOMIZATION_EVIDENCE_PROMPT.format(
            original_query=response['query'],
            medical_advice=response['advice'],
            hospital_document_sources=hospital_sources
        )

        # Call the Llama3-70B evaluator
        evaluation = evaluator_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=1200
        )

        # Parse the evaluation result
        try:
            scores = json.loads(evaluation.choices[0].message.content)
            evidence_results.append({
                "model": response['model'],
                "evidence_scores": scores,
                "overall_score": scores['overall_evidence_score']
            })
        except json.JSONDecodeError:
            evidence_results.append({
                "model": response['model'],
                "error": "Failed to parse evaluation",
                "overall_score": 0.0
            })

    return evidence_results


def format_hospital_document_sources(retrieval_results: List[Dict]) -> str:
    """Format hospital document sources for evidence evaluation."""
    if not retrieval_results:
        return "No hospital documents cited"

    sources_text = ""
    for i, result in enumerate(retrieval_results[:5], 1):
        doc_name = result.get('document', 'unknown_document.pdf')
        chunk_content = result.get('chunk_text', '')
        score = result.get('score', 0.0)
        metadata = result.get('metadata', {})

        sources_text += f"""
[Hospital document source {i}]
Document name: {doc_name}
Relevance: {score:.3f}
Content excerpt: {chunk_content}
Metadata: {metadata}

"""

    return sources_text.strip()
```

**Target threshold:** ≥ 8.0 (internal hospital documents should support higher evidence reliability)

---

## 🧪 Complete Evaluation Flow

### Test Case Design (Hospital Scenarios)
```python
# Typical queries for the hospital-customized system
HOSPITAL_TEST_CASES = [
    # Common emergency department questions
    "How should chest pain in a patient be diagnosed and managed?",
    "What are the diagnostic criteria for acute myocardial infarction?",
    "What is the management workflow for fever in children?",

    # Internal medicine questions
    "What is the differential diagnosis for dizziness in a pregnant patient?",
    "How should fall risk in elderly patients be assessed and prevented?",

    # Surgery-related questions
    "What are the indications for surgery in acute appendicitis?",
    "What is the standard preoperative preparation workflow?"
]
```

### Evaluation Execution Flow
```python
def run_customization_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
    """Run the complete evaluation of the customized system."""

    results = {
        "model": model_name,
        "metrics": {},
        "detailed_results": []
    }

    total_latencies = []
    efficiency_scores = []
    relevance_scores = []
    coverage_scores = []
    actionability_scores = []
    evidence_scores = []

    for query in test_cases:
        # 1. Total processing latency
        latency_result = measure_customization_latency(query)
        total_latencies.append(latency_result['total_latency'])

        # 2. Two-stage retrieval efficiency
        efficiency_result = evaluate_two_stage_efficiency([query])
        efficiency_scores.append(efficiency_result['overall_efficiency'])

        # 3. Retrieval relevance (real data)
        retrieval_results = get_customization_retrieval_results(query)
        relevance_result = evaluate_customization_relevance(retrieval_results)
        relevance_scores.append(relevance_result['average_relevance'])

        # 4. Retrieval coverage
        generated_advice = get_customization_generated_advice(query, retrieval_results)
        coverage_result = evaluate_customization_coverage(generated_advice, retrieval_results)
        coverage_scores.append(coverage_result['coverage'])

        # 5 & 6. LLM-based evaluation (hospital-customized variants)
        response_data = {
            'query': query,
            'advice': generated_advice,
            'retrieval_results': retrieval_results
        }

        actionability_result = evaluate_customization_actionability([response_data])
        actionability_scores.append(actionability_result[0]['overall_score'])

        evidence_result = evaluate_customization_evidence([response_data])
        evidence_scores.append(evidence_result[0]['overall_score'])

        # Record detailed per-query results
        results["detailed_results"].append({
            "query": query,
            "latency": latency_result,
            "efficiency": efficiency_result,
            "relevance": relevance_result,
            "coverage": coverage_result,
            "actionability": actionability_result[0],
            "evidence": evidence_result[0]
        })

    # Compute averaged metrics
    results["metrics"] = {
        "average_latency": sum(total_latencies) / len(total_latencies),
        "average_efficiency": sum(efficiency_scores) / len(efficiency_scores),
        "average_relevance": sum(relevance_scores) / len(relevance_scores),
        "average_coverage": sum(coverage_scores) / len(coverage_scores),
        "average_actionability": sum(actionability_scores) / len(actionability_scores),
        "average_evidence_score": sum(evidence_scores) / len(evidence_scores)
    }

    return results
```
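`run_customization_evaluation` above relies on helpers such as `measure_customization_latency` and `evaluate_two_stage_efficiency` that this guide does not define. Purely as one possible interpretation — the stage-statistics field names below are invented for illustration, and the guide's helper apparently takes a list of queries rather than a stats dictionary — a two-stage efficiency score could combine per-stage timing with how aggressively stage 1 prunes candidate documents:

```python
# Hedged sketch: one way to score the two-stage retrieval (document filter -> chunk search).
from typing import Dict


def two_stage_efficiency_from_stats(stage_stats: Dict[str, float]) -> Dict[str, float]:
    """Blend pruning ratio and stage timings into a 0-1 efficiency score."""
    stage1_time = stage_stats.get("stage1_time", 0.0)          # document-level filtering
    stage2_time = stage_stats.get("stage2_time", 0.0)          # chunk-level ANNOY search
    candidate_docs = max(stage_stats.get("candidate_docs", 1), 1)
    selected_docs = stage_stats.get("selected_docs", candidate_docs)

    pruning_ratio = 1.0 - (selected_docs / candidate_docs)     # more pruning -> tighter focus
    time_score = 1.0 / (1.0 + stage1_time + stage2_time)       # faster -> closer to 1
    overall = 0.5 * pruning_ratio + 0.5 * time_score
    return {"overall_efficiency": overall, "pruning_ratio": pruning_ratio, "time_score": time_score}
```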

---

## 📈 Evaluation Results Analysis Framework

### Customization-Specific Analysis
```python
def analyze_customization_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
    """Compare how the three models perform in the hospital-customized scenario."""

    models = ['OpenBioLLM-70B_direct', 'Jeff_customization_RAG', 'Med42-70B_direct']
    metrics = ['latency', 'efficiency', 'relevance', 'coverage', 'actionability', 'evidence_score']

    comparison = {}

    for metric in metrics:
        comparison[metric] = {
            models[0]: results_A['metrics'][f'average_{metric}'],
            models[1]: results_B['metrics'][f'average_{metric}'],
            models[2]: results_C['metrics'][f'average_{metric}']
        }

        # Quantify the advantage of the customized system
        baseline_openbio = comparison[metric][models[0]]
        baseline_med42 = comparison[metric][models[2]]
        customization_score = comparison[metric][models[1]]

        # Improvement relative to both baseline models
        improvement_vs_openbio = ((customization_score - baseline_openbio) / baseline_openbio) * 100
        improvement_vs_med42 = ((customization_score - baseline_med42) / baseline_med42) * 100

        comparison[metric]['improvement_vs_openbio'] = improvement_vs_openbio
        comparison[metric]['improvement_vs_med42'] = improvement_vs_med42

    return comparison
```

### Hospital Customization Report Generation
```python
def generate_customization_report(comparison_results: Dict) -> str:
    """Generate the evaluation report for the hospital-customized system."""

    report = f"""
# Hospital-Customized RAG System Evaluation Report

## Evaluation Summary

| Metric | OpenBioLLM | Customized System | Med42-70B | vs OpenBio | vs Med42 |
|------|------------|------------|-----------|-----------|----------|
| Latency | {comparison_results['latency']['OpenBioLLM-70B_direct']:.2f}s | {comparison_results['latency']['Jeff_customization_RAG']:.2f}s | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['improvement_vs_openbio']:+.1f}% | {comparison_results['latency']['improvement_vs_med42']:+.1f}% |
| Retrieval efficiency | - | {comparison_results['efficiency']['Jeff_customization_RAG']:.3f} | - | - | - |
| Retrieval relevance | - | {comparison_results['relevance']['Jeff_customization_RAG']:.3f} | - | - | - |
| Retrieval coverage | - | {comparison_results['coverage']['Jeff_customization_RAG']:.1%} | - | - | - |
| Clinical actionability | {comparison_results['actionability']['OpenBioLLM-70B_direct']:.1f}/10 | {comparison_results['actionability']['Jeff_customization_RAG']:.1f}/10 | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['improvement_vs_openbio']:+.1f}% | {comparison_results['actionability']['improvement_vs_med42']:+.1f}% |
| Clinical evidence score | {comparison_results['evidence_score']['OpenBioLLM-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['Jeff_customization_RAG']:.1f}/10 | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['improvement_vs_openbio']:+.1f}% | {comparison_results['evidence_score']['improvement_vs_med42']:+.1f}% |

## Advantages of the Customized System

### 🏥 Hospital-Specificity Advantages
- **Document localization**: based on the hospital's actual clinical guidelines and care standards
- **Resource compatibility**: advice matches the hospital's equipment and staffing
- **Process consistency**: closely aligned with the hospital's existing care workflows

### ⚡ Technical Architecture Advantages
- **Two-stage retrieval**: document filtering first, then chunk retrieval, balancing precision and efficiency
- **BGE-Large-Medical**: medical-domain embedding model with more accurate semantic understanding
- **Top-P smart filtering**: dynamic quality threshold that avoids low-quality results

### 📊 Performance
- **Retrieval speed**: average response of {comparison_results['latency']['Jeff_customization_RAG']:.1f} s
- **Retrieval precision**: {comparison_results['relevance']['Jeff_customization_RAG']:.1%} average relevance
- **Content coverage**: {comparison_results['coverage']['Jeff_customization_RAG']:.1%} document content utilization
"""

    return report
```

---

## 🔧 Experiment Execution Steps

### 1. Environment Preparation
```bash
# Set the HuggingFace token (used for Inference Providers)
export HF_TOKEN=your_huggingface_token

# Activate the customized system environment
source rag_env/bin/activate

# Make sure the hospital document index has been built
python customization/customization_pipeline.py
```

### 2. Jeff System Evaluation Script
```python
# evaluation/run_customization_evaluation.py
def main():
    """Main entry point for evaluating Jeff's customized system."""

    # Load hospital-scenario test cases
    test_cases = HOSPITAL_TEST_CASES

    print("🏥 Starting the evaluation of Jeff's hospital-customized system")

    # Evaluate the three models
    results_openbio = run_customization_evaluation("OpenBioLLM-70B_direct", test_cases)
    results_customization = run_customization_evaluation("Jeff_customization_RAG", test_cases)
    results_med42 = run_customization_evaluation("Med42-70B_direct", test_cases)

    # Analysis and reporting
    comparison_results = analyze_customization_results(results_openbio, results_customization, results_med42)
    report = generate_customization_report(comparison_results)

    # Save results
    save_results("evaluation/results/jeff_customization_evaluation.json", {
        "comparison": comparison_results,
        "detailed_results": [results_openbio, results_customization, results_med42],
        "hospital_specific_analysis": analyze_hospital_advantages(results_customization)
    })

    print("✅ Jeff customized-system evaluation finished; results saved")

    # Generate visualization charts
    generate_customization_visualization(comparison_results)


def analyze_hospital_advantages(customization_results: Dict) -> Dict:
    """Analyze the advantages specific to the hospital-customized system."""
    return {
        "document_diversity": len(set([r['document'] for r in customization_results['detailed_results']])),
        "average_hospital_relevance": customization_results['metrics']['average_relevance'],
        "two_stage_effectiveness": customization_results['metrics']['average_efficiency'],
        "hospital_actionability": customization_results['metrics']['average_actionability']
    }


if __name__ == "__main__":
    main()
```
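`main()` ends with `generate_customization_visualization`, which is likewise not defined in this guide. A minimal matplotlib sketch, under the assumption that a grouped bar chart of the per-metric comparison is what is wanted:

```python
# Illustrative sketch: grouped bar chart of per-metric scores for the three systems.
from typing import Dict

import matplotlib.pyplot as plt


def generate_customization_visualization(comparison_results: Dict,
                                         out_path: str = "evaluation/results/comparison.png") -> None:
    models = ["OpenBioLLM-70B_direct", "Jeff_customization_RAG", "Med42-70B_direct"]
    metrics = [m for m in comparison_results if all(model in comparison_results[m] for model in models)]

    fig, ax = plt.subplots(figsize=(10, 4))
    width = 0.25
    for offset, model in enumerate(models):
        values = [comparison_results[m][model] for m in metrics]
        positions = [i + offset * width for i in range(len(metrics))]
        ax.bar(positions, values, width=width, label=model)

    ax.set_xticks([i + width for i in range(len(metrics))])
    ax.set_xticklabels(metrics, rotation=30, ha="right")
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_path)
```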

### 3. Expected Evaluation Time
```
Hospital-customized system evaluation time:
├── System initialization: 3-6 s (loading BGE-Large-Medical)
├── Per-query processing: 1-3 s (two-stage retrieval)
├── LLM evaluation: 15-25 s per query
├── Number of test cases: 7
└── Total evaluation time: 8-12 minutes
```

---

## 📊 Success Criteria for the Customized System

### System Performance Targets (adjusted)
```
✅ Pass conditions:
1. Total processing latency ≤ 5 s
2. Two-stage retrieval efficiency ≥ 0.8
3. Retrieval relevance ≥ 0.25 (based on real medical data)
4. Retrieval coverage ≥ 55%
5. Clinical actionability ≥ 7.5/10 (hospital-specific advantage)
6. Clinical evidence score ≥ 8.0/10 (high reliability of hospital documents)

🎯 Success criteria for the customized system:
- Outperform general-purpose medical models in hospital-specific scenarios
- Demonstrate the localization advantage of hospital documents
- Achieve the best balance between retrieval precision and speed
```

### Key Performance Indicators (KPI)
```
Hospital-customized system KPIs:
├── Retrieval relevance improvement: 15-25% over general-purpose models
├── Hospital suitability: actionability score > 7.5
├── Document utilization: coverage > 55%, demonstrating the value of customization
├── Response speed: < 3 s average response time
└── Evidence reliability: > 8.0, backed by authoritative hospital documents
```

---

## 🛠️ Implementation Recommendations

### Phased Implementation (Jeff System Specific)
```
Phase 1: Basic metrics (items 1-4)
├── Reuse the timing measurements already in customization_pipeline.py
├── Implement the two-stage retrieval efficiency evaluation
├── Analyze the retrieval relevance of BGE-Large-Medical
└── Implement the hospital-document-based coverage calculation

Phase 2: LLM-based evaluation (items 5-6)
├── Set up HuggingFace Inference Providers
├── Implement the hospital-specific Llama3-70B evaluation client
├── Test the hospital-scenario evaluation prompts
└── Build the hospital document citation analysis logic

Phase 3: Full experiment execution
├── Prepare hospital-scenario test cases
├── Run the evaluation of Jeff's customized system
├── Compare against general-purpose medical models
└── Generate the hospital customization advantage report
```

### Hospital-Customization-Specific Notes
```
⚠️ Important reminders:
1. Make sure the hospital document library has been built and indexed correctly
2. The BGE-Large-Medical model needs sufficient memory (2-4 GB)
3. Tune the two-stage retrieval parameters (top_p=0.6, min_similarity=0.3)
4. Mind the privacy and security requirements of hospital documents
5. Relevance thresholds should be based on real medical retrieval data (0.25-0.35)
6. LLM evaluation should emphasize hospital specificity and localization advantages
```

### Differences from the YanBo System
```
Core differences:
├── Data source: internal hospital documents vs public medical guidelines
├── Retrieval architecture: two-stage ANNOY vs single-stage vector retrieval
├── Embedding model: BGE-Large-Medical vs PubMedBERT
├── Optimization goal: hospital specificity vs general medical knowledge
└── Evaluation focus: localized suitability vs standard medical quality
```

---

**Jeff hospital-customized system evaluation guide complete. Please implement the customized-system evaluation experiments according to this guide.**
evaluation/user_query.txt ADDED
@@ -0,0 +1,34 @@
Below are nine quick-consultation prompts written in an "I am asking you" tone, grouped into three categories with three questions each:


1.
Diagnosis-Focused
60-year-old patient with hypertension history, sudden chest pain. What are possible causes and how to assess?

2.
Treatment-Focused
Suspected acute ischemic stroke. Tell me the next steps to take.

3.
20 y/o female, porphyria, sudden seizure. What are possible causes and complete management workflow?

(For testing, start with these three and check the results; once debugging and tuning are done, use the ones below.)
---

### 1. Diagnosis-Focused

1. I have a 68-year-old man with atrial fibrillation presenting with sudden slurred speech and right-sided weakness—what are the possible diagnoses, and how would you evaluate them?
2. A 40-year-old woman reports fever, urinary frequency, and dysuria—what differential diagnoses should I consider, and which tests would you order?
3. A 50-year-old patient has progressive dyspnea on exertion and orthopnea over two weeks—what are the likely causes, and what diagnostic steps should I take?

### 2. Treatment-Focused

4. ECG shows a suspected acute STEMI—what immediate interventions should I initiate in the next five minutes?
5. I have a patient diagnosed with bacterial meningitis—what empiric antibiotic regimen and supportive measures should I implement?
6. A patient is in septic shock with BP 80/50 mmHg and HR 120 bpm—what fluid resuscitation and vasopressor strategy would you recommend?

### 3. Mixed (Diagnosis + Treatment)

7. A 75-year-old diabetic presents with a non-healing foot ulcer and fever—what differential for osteomyelitis, diagnostic workup, and management plan do you suggest?
8. A 60-year-old COPD patient has worsening dyspnea and hypercapnia on ABG—how would you confirm the diagnosis, and what is your stepwise treatment approach?
9. A 28-year-old woman is experiencing postpartum hemorrhage—what are the possible causes, what immediate resuscitation steps should I take, and how would you proceed with definitive management?
src/generation.py CHANGED
@@ -28,22 +28,22 @@ logging.basicConfig(
28
  )
29
  logger = logging.getLogger(__name__)
30
 
31
- # Fallback Generation Configuration
32
  FALLBACK_TIMEOUTS = {
33
  "primary": 30.0, # Primary Med42-70B with full RAG context
34
- "fallback_1": 15.0, # Simplified Med42-70B without RAG
35
- "fallback_2": 1.0 # RAG template generation (instant)
36
  }
37
 
38
  FALLBACK_TOKEN_LIMITS = {
39
- "primary": 1200, # Full comprehensive medical advice
40
- "fallback_1": 600, # Concise medical guidance
41
- "fallback_2": 0 # Template-based, no LLM tokens
42
  }
43
 
44
  FALLBACK_CONFIDENCE_SCORES = {
45
- "fallback_1": 0.6, # Med42-70B without RAG context
46
- "fallback_2": 0.4 # RAG template only
47
  }
48
 
49
  FALLBACK_ERROR_TRIGGERS = {
@@ -323,11 +323,12 @@ class MedicalAdviceGenerator:
323
  {focus_guidance}
324
 
325
  Provide guidance with:
 
 
326
  • Numbered points (1. 2. 3.) for key steps
327
  • Line breaks between major sections
328
  • Highlight medications with dosages and routes
329
- Reference evidence from above sources
330
- • Emphasize clinical judgment
331
 
332
  IMPORTANT: Keep response under 1000 words. Use concise numbered points. For complex cases with multiple conditions, address the most urgent condition first, then relevant comorbidities. Prioritize actionable clinical steps over theoretical explanations.
333
 
@@ -363,8 +364,9 @@ class MedicalAdviceGenerator:
363
  # Check for API errors in response
364
  if result.get('error'):
365
  logger.warning(f"⚠️ Med42-70B returned error: {result['error']}")
366
- # Attempt fallback instead of raising exception
367
- return self._attempt_fallback_generation(prompt, result['error'])
 
368
 
369
  # Check for empty response
370
  if not result.get('raw_response', '').strip():
@@ -514,7 +516,7 @@ class MedicalAdviceGenerator:
514
  "disclaimer": "This system experienced a technical error. Please consult with qualified healthcare providers for medical decisions."
515
  }
516
 
517
- def _attempt_fallback_generation(self, original_prompt: str, primary_error: str) -> Dict[str, Any]:
518
  """
519
  Orchestrate fallback generation attempts with detailed logging
520
 
@@ -524,21 +526,22 @@ class MedicalAdviceGenerator:
524
  Args:
525
  original_prompt: The complete RAG prompt that failed in primary generation
526
  primary_error: Error details from the primary generation attempt
 
527
 
528
  Returns:
529
  Dict containing successful fallback response or final error response
530
  """
531
  logger.info("🔄 FALLBACK: Attempting fallback generation strategies")
532
 
533
- # Fallback 1: Simplified Med42-70B without RAG context
534
  try:
535
- logger.info("📍 FALLBACK 1: Med42-70B without RAG context")
536
- fallback_1_result = self._attempt_simplified_med42(original_prompt, primary_error)
537
 
538
  if not fallback_1_result.get('error'):
539
- logger.info("✅ FALLBACK 1: Success - Med42-70B without RAG")
540
- # Mark response as fallback method 1
541
- fallback_1_result['fallback_method'] = 'med42_simplified'
542
  fallback_1_result['primary_error'] = primary_error
543
  return fallback_1_result
544
  else:
@@ -547,20 +550,23 @@ class MedicalAdviceGenerator:
547
  except Exception as e:
548
  logger.error(f"❌ FALLBACK 1: Exception - {e}")
549
 
550
- # Fallback 2: RAG-only template response
551
  try:
552
- logger.info("📍 FALLBACK 2: RAG-only template response")
553
- fallback_2_result = self._attempt_rag_template(original_prompt, primary_error)
554
-
555
- if not fallback_2_result.get('error'):
556
- logger.info("✅ FALLBACK 2: Success - RAG template response")
557
- # Mark response as fallback method 2
558
- fallback_2_result['fallback_method'] = 'rag_template'
559
- fallback_2_result['primary_error'] = primary_error
560
- return fallback_2_result
561
- else:
562
- logger.warning(f"❌ FALLBACK 2: Failed - {fallback_2_result.get('error')}")
563
-
 
 
 
564
  except Exception as e:
565
  logger.error(f"❌ FALLBACK 2: Exception - {e}")
566
 
@@ -588,15 +594,17 @@ class MedicalAdviceGenerator:
588
  'latency': 0.0
589
  }
590
 
591
- def _attempt_simplified_med42(self, original_prompt: str, primary_error: str) -> Dict[str, Any]:
 
592
  """
593
- Attempt Med42-70B generation with simplified prompt (Fallback 1)
594
 
595
- This method retries generation using the same Med42-70B model but with:
596
- - Simplified prompt (user query only, no RAG context)
597
- - Reduced timeout (15 seconds)
598
- - Reduced token limit (300 tokens)
599
- - Higher success probability due to reduced complexity
 
600
 
601
  Args:
602
  original_prompt: Original RAG prompt that failed
@@ -691,7 +699,7 @@ class MedicalAdviceGenerator:
691
  Returns:
692
  Dict with template response or error details
693
  """
694
- logger.info("📍 FALLBACK 2: RAG-only template response")
695
 
696
  try:
697
  # Extract user query and RAG context from original prompt
@@ -699,7 +707,7 @@ class MedicalAdviceGenerator:
699
  rag_context = self._extract_rag_context_from_prompt(original_prompt)
700
 
701
  if not user_query:
702
- logger.error("❌ FALLBACK 2: Failed to extract user query")
703
  return {
704
  'error': 'Unable to extract user query for template response',
705
  'fallback_method': 'rag_template'
@@ -713,11 +721,11 @@ class MedicalAdviceGenerator:
713
  # Create full template response with RAG context
714
  template_response = self._generate_rag_template_response(user_query, rag_context)
715
 
716
- logger.info("✅ FALLBACK 2: Success - RAG template response")
717
 
718
  return {
719
  'extracted_condition': 'rag_template_response',
720
- 'confidence': str(FALLBACK_CONFIDENCE_SCORES['fallback_2']), # 0.4
721
  'raw_response': template_response,
722
  'fallback_method': 'rag_template',
723
  'primary_error': primary_error,
 
28
  )
29
  logger = logging.getLogger(__name__)
30
 
31
+ # Fallback Generation Configuration (Simplified Architecture)
32
  FALLBACK_TIMEOUTS = {
33
  "primary": 30.0, # Primary Med42-70B with full RAG context
34
+ "fallback_1": 1.0, # RAG template generation (renamed from fallback_2)
35
+ "fallback_2": 0.1 # Minimal template generation (instant)
36
  }
37
 
38
  FALLBACK_TOKEN_LIMITS = {
39
+ "primary": 1600, # Full comprehensive medical advice (increased)
40
+ "fallback_1": 0, # RAG template-based, no LLM tokens (renamed from fallback_2)
41
+ "fallback_2": 0 # Minimal template-based, no LLM tokens
42
  }
43
 
44
  FALLBACK_CONFIDENCE_SCORES = {
45
+ "fallback_1": 0.4, # RAG template only (renamed from fallback_2)
46
+ "fallback_2": 0.2 # Minimal template only
47
  }
48
 
49
  FALLBACK_ERROR_TRIGGERS = {
 
323
  {focus_guidance}
324
 
325
  Provide guidance with:
326
+ • Prioritize information and evidence from above sources (PRIMARY)
327
+ • Use your medical knowledge to organize guidelines into actionable steps
328
  • Numbered points (1. 2. 3.) for key steps
329
  • Line breaks between major sections
330
  • Highlight medications with dosages and routes
331
+ Emphasize clinical judgment for individual patient factors (SECONDARY)
 
332
 
333
  IMPORTANT: Keep response under 1000 words. Use concise numbered points. For complex cases with multiple conditions, address the most urgent condition first, then relevant comorbidities. Prioritize actionable clinical steps over theoretical explanations.
334
 
 
364
  # Check for API errors in response
365
  if result.get('error'):
366
  logger.warning(f"⚠️ Med42-70B returned error: {result['error']}")
367
+ # Pass any available content for potential simplification
368
+ primary_content = result.get('raw_response', '')
369
+ return self._attempt_fallback_generation(prompt, result['error'], primary_content)
370
 
371
  # Check for empty response
372
  if not result.get('raw_response', '').strip():
 
516
  "disclaimer": "This system experienced a technical error. Please consult with qualified healthcare providers for medical decisions."
517
  }
518
 
519
+ def _attempt_fallback_generation(self, original_prompt: str, primary_error: str, primary_result: str = None) -> Dict[str, Any]:
520
  """
521
  Orchestrate fallback generation attempts with detailed logging
522
 
 
526
  Args:
527
  original_prompt: The complete RAG prompt that failed in primary generation
528
  primary_error: Error details from the primary generation attempt
529
+ primary_result: Primary result content (if available) for simplification
530
 
531
  Returns:
532
  Dict containing successful fallback response or final error response
533
  """
534
  logger.info("🔄 FALLBACK: Attempting fallback generation strategies")
535
 
536
+ # Fallback 1: RAG-only template response (renamed from fallback_2)
537
  try:
538
+ logger.info("📍 FALLBACK 1: RAG-only template response")
539
+ fallback_1_result = self._attempt_rag_template(original_prompt, primary_error)
540
 
541
  if not fallback_1_result.get('error'):
542
+ logger.info("✅ FALLBACK 1: Success - RAG template response")
543
+ # Mark response as fallback method 1 (renamed)
544
+ fallback_1_result['fallback_method'] = 'rag_template'
545
  fallback_1_result['primary_error'] = primary_error
546
  return fallback_1_result
547
  else:
 
550
  except Exception as e:
551
  logger.error(f"❌ FALLBACK 1: Exception - {e}")
552
 
553
+ # Fallback 2: Minimal template response (renamed from fallback_3)
554
  try:
555
+ logger.info("📍 FALLBACK 2: Minimal template response")
556
+ user_query = self._extract_user_query_from_prompt(original_prompt)
557
+ minimal_response = self._generate_minimal_template_response(user_query or "medical query")
558
+
559
+ logger.info("✅ FALLBACK 2: Success - Minimal template response")
560
+ return {
561
+ 'extracted_condition': 'minimal_template_response',
562
+ 'confidence': str(FALLBACK_CONFIDENCE_SCORES['fallback_2']),
563
+ 'raw_response': minimal_response,
564
+ 'fallback_method': 'minimal_template',
565
+ 'primary_error': primary_error,
566
+ 'latency': 0.1,
567
+ 'template_based': True
568
+ }
569
+
570
  except Exception as e:
571
  logger.error(f"❌ FALLBACK 2: Exception - {e}")
572
 
 
594
  'latency': 0.0
595
  }
596
 
597
+
598
+ def _attempt_rag_template(self, original_prompt: str, primary_error: str) -> Dict[str, Any]:
599
  """
600
+ Generate template-based response using available RAG context (Fallback 1)
601
 
602
+ This method creates a structured response using retrieved medical guidelines
603
+ without LLM processing:
604
+ - Instant response (no API calls)
605
+ - Template-based formatting
606
+ - Uses extracted RAG context from original prompt
607
+ - Lower confidence score (0.4)
608
 
609
  Args:
610
  original_prompt: Original RAG prompt that failed
 
699
  Returns:
700
  Dict with template response or error details
701
  """
702
+ logger.info("📍 FALLBACK 1: RAG-only template response")
703
 
704
  try:
705
  # Extract user query and RAG context from original prompt
 
707
  rag_context = self._extract_rag_context_from_prompt(original_prompt)
708
 
709
  if not user_query:
710
+ logger.error("❌ FALLBACK 1: Failed to extract user query")
711
  return {
712
  'error': 'Unable to extract user query for template response',
713
  'fallback_method': 'rag_template'
 
721
  # Create full template response with RAG context
722
  template_response = self._generate_rag_template_response(user_query, rag_context)
723
 
724
+ logger.info("✅ FALLBACK 1: Success - RAG template response")
725
 
726
  return {
727
  'extracted_condition': 'rag_template_response',
728
+ 'confidence': str(FALLBACK_CONFIDENCE_SCORES['fallback_1']), # 0.4 (renamed)
729
  'raw_response': template_response,
730
  'fallback_method': 'rag_template',
731
  'primary_error': primary_error,
src/llm_clients.py CHANGED
@@ -236,13 +236,7 @@ class llm_Med42_70BClient:
236
 
237
  # Detect short response (less than 2 characters)
238
  if len(response_stripped) < 2:
239
- return True
240
-
241
- # Detect long response - allow some flexibility for detailed medical advice
242
- # 750 words ≈ 1000-1200 chars, allow some flexibility to 2500 chars
243
- if len(response_stripped) > 2500: # Changed from 1000 to 2500
244
- self.logger.warning(f"Response extremely long: {len(response_stripped)} chars")
245
- return True
246
 
247
  return False
248
 
 
236
 
237
  # Detect short response (less than 2 characters)
238
  if len(response_stripped) < 2:
239
+ return True
 
 
 
 
 
 
240
 
241
  return False
242
 
src/medical_conditions.py CHANGED
@@ -55,6 +55,14 @@ CONDITION_KEYWORD_MAPPING: Dict[str, Dict[str, str]] = {
55
  "acute_coronary_syndrome": {
56
  "emergency": "ACS|chest pain|ECG changes",
57
  "treatment": "antiplatelet|statins|cardiac monitoring"
 
 
 
 
 
 
 
 
58
  }
59
  }
60
 
 
55
  "acute_coronary_syndrome": {
56
  "emergency": "ACS|chest pain|ECG changes",
57
  "treatment": "antiplatelet|statins|cardiac monitoring"
58
+ },
59
+ "acute seizure": {
60
+ "emergency": "seizure|convulsion|epilepsy|loss of consciousness",
61
+ "treatment": "anticonvulsant|benzodiazepine|neurologic assessment"
62
+ },
63
+ "seizure disorder": {
64
+ "emergency": "seizure|status epilepticus|postictal state",
65
+ "treatment": "antiepileptic drugs|EEG monitoring|neurology consult"
66
  }
67
  }
68
 
src/user_prompt.py CHANGED
@@ -114,14 +114,35 @@ class UserPromptProcessor:
114
  return predefined_result
115
  logger.info("❌ LEVEL 1: FAILED - No predefined mapping found")
116
 
117
- # Level 2: Llama3-Med42-70B Extraction (if available)
118
- logger.info("📍 LEVEL 2: Attempting LLM extraction...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  if self.llm_client:
120
  llm_result = self._extract_with_llm(user_query)
121
  if llm_result:
122
- logger.info("✅ LEVEL 2: SUCCESS - LLM extraction successful")
123
  return llm_result
124
- logger.info("❌ LEVEL 2: FAILED - LLM extraction failed")
125
  else:
126
  logger.info("⏭️ LEVEL 2: SKIPPED - No LLM client available")
127
 
@@ -153,9 +174,11 @@ class UserPromptProcessor:
153
  # No match found
154
  logger.warning("🚫 ALL LEVELS FAILED - Returning empty result")
155
  return {
 
156
  'condition': '',
157
  'emergency_keywords': '',
158
- 'treatment_keywords': ''
 
159
  }
160
 
161
  def _predefined_mapping(self, user_query: str) -> Optional[Dict[str, str]]:
@@ -184,6 +207,127 @@ class UserPromptProcessor:
184
 
185
  return None
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def _extract_with_llm(self, user_query: str) -> Optional[Dict[str, str]]:
188
  """
189
  Use Llama3-Med42-70B for advanced condition extraction
@@ -606,7 +750,7 @@ Please confirm:
606
  Dict with invalid query guidance
607
  """
608
  return {
609
- 'type': 'invalid_query',
610
  'message': "This is OnCall.AI, a clinical medical assistance platform. "
611
  "Please input a medical problem you need help resolving. "
612
  "\n\nExamples:\n"
 
114
  return predefined_result
115
  logger.info("❌ LEVEL 1: FAILED - No predefined mapping found")
116
 
117
+ # Level 2+4 Combined: Single LLM call for dual processing
118
+ logger.info("📍 LEVEL 2+4 COMBINED: Attempting unified extraction + validation")
119
+ if self.llm_client:
120
+ combined_result = self._combined_llm_extraction_validation(user_query)
121
+ if combined_result:
122
+ if combined_result['query_status'] == 'condition_found':
123
+ logger.info("✅ LEVEL 2+4: SUCCESS - Condition extracted")
124
+ return combined_result
125
+ elif combined_result['query_status'] in ['non_medical', 'invalid_query']:
126
+ logger.info("✅ LEVEL 2+4: Query identified as non-medical")
127
+ return combined_result
128
+ elif combined_result['query_status'] == 'medical_no_condition':
129
+ logger.info("✅ LEVEL 2+4: Medical query confirmed, proceeding to semantic search")
130
+ # Continue to Level 3 since query is validated as medical
131
+ else:
132
+ logger.info("❌ LEVEL 2+4: Combined processing failed, falling back to individual levels")
133
+ else:
134
+ logger.info("❌ LEVEL 2+4: Combined processing failed, falling back to individual levels")
135
+ else:
136
+ logger.info("⏭️ LEVEL 2+4: SKIPPED - No LLM client available")
137
+
138
+ # Level 2: Fallback LLM Extraction (if combined failed)
139
+ logger.info("📍 LEVEL 2: Attempting individual LLM extraction...")
140
  if self.llm_client:
141
  llm_result = self._extract_with_llm(user_query)
142
  if llm_result:
143
+ logger.info("✅ LEVEL 2: SUCCESS - Individual LLM extraction successful")
144
  return llm_result
145
+ logger.info("❌ LEVEL 2: FAILED - Individual LLM extraction failed")
146
  else:
147
  logger.info("⏭️ LEVEL 2: SKIPPED - No LLM client available")
148
 
 
174
  # No match found
175
  logger.warning("🚫 ALL LEVELS FAILED - Returning empty result")
176
  return {
177
+ 'query_status': 'no_match_found',
178
  'condition': '',
179
  'emergency_keywords': '',
180
+ 'treatment_keywords': '',
181
+ 'message': 'Unable to process medical query'
182
  }
183
 
184
  def _predefined_mapping(self, user_query: str) -> Optional[Dict[str, str]]:
 
207
 
208
  return None
209
 
210
+ def _combined_llm_extraction_validation(self, user_query: str) -> Optional[Dict[str, Any]]:
211
+ """
212
+ Combined Level 2+4: Extract condition AND validate medical query in single LLM call
213
+ Expected time reduction: 16-25s → 12-15s (40% improvement)
214
+
215
+ Args:
216
+ user_query: User's medical query
217
+
218
+ Returns:
219
+ Dict with combined result indicating:
220
+ - condition_found: Specific medical condition extracted
221
+ - medical_no_condition: Medical query but no specific condition
222
+ - non_medical: Non-medical query (reject)
223
+ """
224
+ if not self.llm_client:
225
+ return None
226
+
227
+ try:
228
+ # Combined prompt for both extraction and validation
229
+ combined_prompt = f"""Medical Query Analysis - Dual Task Processing:
230
+
231
+ QUERY: "{user_query}"
232
+
233
+ TASKS:
234
+ 1. Extract primary medical condition (if specific condition identifiable)
235
+ 2. Determine if this is a medical-related query
236
+
237
+ RESPONSE FORMAT:
238
+ MEDICAL: YES/NO
239
+ CONDITION: [specific condition name or "NONE"]
240
+ CONFIDENCE: [0.1-1.0]
241
+
242
+ EXAMPLES:
243
+ - "chest pain and shortness of breath" → MEDICAL: YES, CONDITION: Acute Coronary Syndrome, CONFIDENCE: 0.9
244
+ - "how to cook pasta safely" → MEDICAL: NO, CONDITION: NONE, CONFIDENCE: 0.95
245
+ - "persistent headache treatment options" → MEDICAL: YES, CONDITION: Headache Disorder, CONFIDENCE: 0.8
246
+ - "feeling unwell lately" → MEDICAL: YES, CONDITION: NONE, CONFIDENCE: 0.6
247
+
248
+ Return ONLY the specified format."""
249
+
250
+ logger.info("🤖 COMBINED L2+4: Single LLM call for extraction + validation")
251
+
252
+ llama_response = self.llm_client.analyze_medical_query(
253
+ query=combined_prompt,
254
+ max_tokens=100, # Keep concise for condition name
255
+ timeout=12.0 # Single call timeout
256
+ )
257
+
258
+ response_text = llama_response.get('extracted_condition', '').strip()
259
+ logger.info(f"🤖 Combined L2+4 result: {response_text}")
260
+
261
+ # Parse structured response
262
+ medical_status = self._extract_field(response_text, 'MEDICAL')
263
+ condition_name = self._extract_field(response_text, 'CONDITION')
264
+ confidence = self._extract_field(response_text, 'CONFIDENCE')
265
+
266
+ # Non-medical query detection
267
+ if medical_status == 'NO':
268
+ logger.info("✅ COMBINED L2+4: Identified as non-medical query")
269
+ return {
270
+ 'query_status': 'non_medical',
271
+ 'message': 'This appears to be a non-medical query.',
272
+ 'condition': '',
273
+ 'emergency_keywords': '',
274
+ 'treatment_keywords': '',
275
+ 'extraction_method': 'combined_llm_rejection',
276
+ 'confidence': float(confidence) if confidence else 0.9
277
+ }
278
+
279
+ # Medical query with specific condition
280
+ if condition_name and condition_name != 'NONE':
281
+ standardized_condition = self._extract_condition_from_query(condition_name)
282
+ if standardized_condition:
283
+ condition_details = get_condition_details(standardized_condition)
284
+ if condition_details:
285
+ logger.info(f"✅ COMBINED L2+4: Success - {standardized_condition}")
286
+ return {
287
+ 'query_status': 'condition_found',
288
+ 'condition': standardized_condition,
289
+ 'emergency_keywords': condition_details['emergency'],
290
+ 'treatment_keywords': condition_details['treatment'],
291
+ 'extraction_method': 'combined_llm',
292
+ 'confidence': float(confidence) if confidence else 0.8
293
+ }
294
+
295
+ # Medical query but no specific condition extractable
296
+ if medical_status == 'YES':
297
+ logger.info("✅ COMBINED L2+4: Medical query confirmed, no specific condition")
298
+ return {
299
+ 'query_status': 'medical_no_condition',
300
+ 'message': 'Medical query confirmed, proceeding to semantic search',
301
+ 'condition': '',
302
+ 'emergency_keywords': '',
303
+ 'treatment_keywords': '',
304
+ 'extraction_method': 'combined_llm_medical_only',
305
+ 'confidence': float(confidence) if confidence else 0.6
306
+ }
307
+
308
+ logger.info("❌ COMBINED L2+4: No valid condition extracted")
309
+ return None
310
+
311
+ except Exception as e:
312
+ logger.error(f"Combined L2+4 extraction failed: {e}")
313
+ return None
314
+
315
+ def _extract_field(self, text: str, field_name: str) -> Optional[str]:
316
+ """
317
+ Extract specific field from structured LLM response
318
+
319
+ Args:
320
+ text: Raw LLM response text
321
+ field_name: Field to extract (MEDICAL, CONDITION, CONFIDENCE)
322
+
323
+ Returns:
324
+ Extracted field value or None
325
+ """
326
+ import re
327
+ pattern = rf"{field_name}:\s*([^\n,]+)"
328
+ match = re.search(pattern, text, re.IGNORECASE)
329
+ return match.group(1).strip() if match else None
330
+
331
  def _extract_with_llm(self, user_query: str) -> Optional[Dict[str, str]]:
332
  """
333
  Use Llama3-Med42-70B for advanced condition extraction
 
750
  Dict with invalid query guidance
751
  """
752
  return {
753
+ 'query_status': 'invalid_query',
754
  'message': "This is OnCall.AI, a clinical medical assistance platform. "
755
  "Please input a medical problem you need help resolving. "
756
  "\n\nExamples:\n"