YanBoChen committed · Commit 5f9dffa · 1 Parent(s): 253609b

fix(evaluation): improve evaluation instructions and add structured assessment phases

Files changed (1):
  1. evaluation/evaluation_instruction.md +130 -36

evaluation/evaluation_instruction.md (content after this commit):

# Model use

LLM models (for comparison) with our own version:
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
https://huggingface.co/m42-health/Llama3-Med42-70B
https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct

### Evaluation Execution Flow

```python
def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
    """Run the complete six-metric evaluation."""

    results = {
        "model": model_name,
        "metrics": {},
        "detailed_results": []
    }

    total_latencies = []
    extraction_successes = []
    relevance_scores = []
    coverage_scores = []
    actionability_scores = []
    evidence_scores = []

    for query in test_cases:
        # Run the model and measure all metrics
        start_time = time.time()

        # 1. Total processing latency
        latency_result = measure_total_latency(query)
        total_latencies.append(latency_result['total_latency'])

        # 2. Condition extraction success rate
        extraction_result = evaluate_condition_extraction([query])
        extraction_successes.append(extraction_result['success_rate'])

        # 3 & 4. Retrieval relevance and coverage (requires actual retrieval results)
        retrieval_results = get_retrieval_results(query)
        relevance_result = evaluate_retrieval_relevance(retrieval_results)
        relevance_scores.append(relevance_result['average_relevance'])

        generated_advice = get_generated_advice(query, retrieval_results)
        coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
        coverage_scores.append(coverage_result['coverage'])

        # 5 & 6. LLM-based evaluation (requires the complete response)
        response_data = {
            'query': query,
            'advice': generated_advice,
            'retrieval_results': retrieval_results
        }

        actionability_result = evaluate_clinical_actionability([response_data])
        actionability_scores.append(actionability_result[0]['overall_score'])

        evidence_result = evaluate_clinical_evidence([response_data])
        evidence_scores.append(evidence_result[0]['overall_score'])

        # Record detailed results
        results["detailed_results"].append({
            "query": query,
            # ...
            "actionability": actionability_result[0],
            "evidence": evidence_result[0]
        })

    # Compute average metrics
    results["metrics"] = {
        "average_latency": sum(total_latencies) / len(total_latencies),
        # ...
        "average_actionability": sum(actionability_scores) / len(actionability_scores),
        "average_evidence_score": sum(evidence_scores) / len(evidence_scores)
    }

    return results
```
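
The flow above relies on helper functions (`measure_total_latency`, `evaluate_condition_extraction`, `get_retrieval_results`, and others) that are not shown in this excerpt. As a rough illustration only, here is a minimal sketch of the latency helper; `process_medical_query` is a hypothetical placeholder for the system's actual pipeline entry point:

```python
import time
from typing import Any, Dict

def measure_total_latency(query: str) -> Dict[str, Any]:
    """Sketch: time one end-to-end pipeline run for a single query."""
    start = time.time()
    response = process_medical_query(query)  # hypothetical pipeline entry point
    return {
        "total_latency": time.time() - start,  # seconds
        "response": response,
    }
```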

## 📈 Evaluation Results Analysis Framework

### Statistical Analysis

```python
def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
    """Compare the evaluation results of the three models."""

    models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
    metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']

    comparison = {}

    for metric in metrics:
        comparison[metric] = {
            models[0]: results_A['metrics'][f'average_{metric}'],
            models[1]: results_B['metrics'][f'average_{metric}'],
            models[2]: results_C['metrics'][f'average_{metric}']
        }

        # Compute relative improvement over the direct baseline
        baseline = comparison[metric][models[0]]
        rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100

        comparison[metric]['rag_improvement_percent'] = rag_improvement

    return comparison
```

### Report Generation

```python
def generate_evaluation_report(comparison_results: Dict) -> str:
    """Generate the evaluation report."""

    report = f"""
# OnCall.ai System Evaluation Report

## Evaluation Summary

| Metric | Med42-70B | RAG-Enhanced | OpenBioLLM | RAG Improvement % |
|------|-----------|-----------|------------|----------|
| Latency | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |

| Retrieval coverage | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
| Clinical actionability | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
| Clinical evidence score | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |

"""

    return report
```
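
For reference, the two functions above are chained as follows; the result variables are the ones produced by the `run_complete_evaluation` calls in `run_evaluation.py` below:

```python
# Compare the three per-model result dictionaries, then render the markdown report.
comparison = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
print(generate_evaluation_report(comparison))
```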

## 🔧 Experiment Execution Steps

### 1. Environment Setup

```bash
# Set the HuggingFace token (used for Inference Providers)
export HF_TOKEN=your_huggingface_token

export ONCALL_EVAL_MODE=true
```
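
How the evaluation code consumes these variables is not shown in this excerpt; a minimal sketch of the assumed pattern:

```python
import os

# Assumed pattern: read the token and the evaluation-mode flag at startup.
HF_TOKEN = os.environ["HF_TOKEN"]  # required for Inference Providers
EVAL_MODE = os.environ.get("ONCALL_EVAL_MODE", "false").lower() == "true"
```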

### 2. Experiment Execution Script Skeleton

```python
# evaluation/run_evaluation.py
def main():
    """Main evaluation entry point."""

    # Load test cases
    test_cases = MEDICAL_TEST_CASES

    # Experiment A: YanBo system evaluation
    print("🔬 Starting Experiment A: YanBo system evaluation")
    results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
    results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
    results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)

    # Analysis and reporting
    comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
    report_A = generate_evaluation_report(comparison_A)

    # Save results
    save_results("evaluation/results/yanbo_evaluation.json", {
        "comparison": comparison_A,
        "detailed_results": [results_med42_direct, results_general_rag, results_openbio]
    })

    print("✅ Experiment A complete; results saved")

    # Experiment B: Jeff system evaluation
    print("🔬 Starting Experiment B: Jeff system evaluation")
    results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
    results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
    results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)

    # Analysis and reporting
    comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
    report_B = generate_evaluation_report(comparison_B)

    # Save results
    save_results("evaluation/results/jeff_evaluation.json", {
        "comparison": comparison_B,
        "detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
    })

    print("✅ Experiment B complete; results saved")

if __name__ == "__main__":
    main()
```
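
`save_results` is referenced above but not defined in this excerpt; a minimal sketch, assuming plain JSON output is sufficient:

```python
import json
from pathlib import Path
from typing import Any, Dict

def save_results(path: str, payload: Dict[str, Any]) -> None:
    """Sketch: write an evaluation payload to disk as JSON."""
    out = Path(path)
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
```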

### 3. Expected Evaluation Time

```
Total evaluation time estimate:
├── Per-query processing time: ~30 seconds (including LLM-based evaluation)
...
```

## 📊 Evaluation Success Criteria

### System Performance Targets

```
✅ Pass criteria:
1. Total processing latency ≤ 30 seconds
2. Condition extraction success rate ≥ 80%
3. Retrieval relevance ≥ 0.2
4. Retrieval coverage ≥ 60%
5. Clinical actionability ≥ 7.0/10
...
```
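
The pass criteria above can be checked mechanically against the averaged metrics from `run_complete_evaluation`. A minimal sketch follows; the metric key names are assumed to follow the `average_*` convention used elsewhere in this guide, and criteria not shown in this excerpt are omitted:

```python
from typing import Dict, Tuple

# (comparison operator, threshold) per metric; only the criteria listed above are included.
TARGETS: Dict[str, Tuple[str, float]] = {
    "average_latency": ("<=", 30.0),                  # seconds
    "average_extraction_success_rate": (">=", 0.80),
    "average_relevance": (">=", 0.2),
    "average_coverage": (">=", 0.60),
    "average_actionability": (">=", 7.0),             # out of 10
}

def meets_targets(metrics: Dict[str, float]) -> Dict[str, bool]:
    """Return a pass/fail flag for each metric that has a defined target."""
    checks = {}
    for name, (op, threshold) in TARGETS.items():
        value = metrics.get(name)
        if value is None:
            continue  # metric not present in this run
        checks[name] = value <= threshold if op == "<=" else value >= threshold
    return checks
```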

### Comparative Analysis Focus

```
Key analysis dimensions:
├── Impact of RAG on processing time (may add latency)
...
```

## 🛠️ Implementation Recommendations

### Phased Implementation

```
Phase 1: Implement the basic metrics (items 1-4)
├── Reuse the existing timing measurements in app.py
...
```

### Implementation Notes

```
⚠️ Important reminders:
1. All evaluation code should be independent of the existing system to avoid affecting normal operation
...
```

---

**Evaluation guide complete. Please implement the evaluation experiments according to this guide.**

## Phase 1: Initial Assessment

### Step 1.1 - Analysis of Your Clarification

I have re-read your explanation and now understand your intent.

### Step 1.2 - No Unclear Requirements

### Step 1.3 - Confirmation of Understanding (Chinese)

Fully understood. Your evaluation architecture is:

## 🎯 **Test Tiers for the Evaluation Metrics**

### **Retrieval-Only Testing (Metrics 2, 3, 4)**

```python
# Test only the RAG systems, because only RAG systems have these components:
retrieval_only_metrics = [
    "Condition extraction success rate",  # only your system has user_prompt.py
    "Retrieval relevance",                # only RAG systems produce retrieval results
    "Retrieval coverage"                  # only RAG systems have a retrieval→generation mapping
]

# Systems under test:
# - Med42-70B_general_RAG (your system) ✅
# - Med42-70B_customized_RAG (Jeff's system) ✅
# - Med42-70B_direct (no RAG) ❌ no retrieval components
# - OpenBioLLM-70B (no RAG) ❌ no retrieval components
```

### **Testing All Three Models Together (Metrics 1, 5, 6)**

```python
# Metrics that can be measured for every model:
universal_metrics = [
    "Total processing latency",  # every model has a response time
    "Clinical actionability",    # Llama3-70B judges every model's output
    "Clinical evidence score"    # Llama3-70B judges every model's output
]

# Systems under test:
# - Med42-70B_direct ✅
# - Med42-70B_general_RAG ✅
# - OpenBioLLM-70B ✅
```

## 📊 **Tiered Evaluation Strategy**

### **Stage 1: Internal Evaluation of the RAG Systems**

```python
# Test only the RAG-enabled systems
rag_systems = ["Med42-70B_general_RAG", "Med42-70B_customized_RAG"]

# Measure the RAG-specific metrics
for system in rag_systems:
    evaluate_retrieval_metrics(system)  # metrics 2, 3, 4
```
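
`evaluate_retrieval_metrics` is only named above. A minimal sketch that reuses the helpers from `run_complete_evaluation`; the extra `test_cases` parameter and the idea that the helpers route queries to the given `system` are assumptions:

```python
from typing import Dict, List

def evaluate_retrieval_metrics(system: str, test_cases: List[str]) -> Dict[str, float]:
    """Sketch: average the RAG-specific metrics (2, 3, 4) for one RAG system."""
    extraction, relevance, coverage = [], [], []
    for query in test_cases:
        extraction.append(evaluate_condition_extraction([query])["success_rate"])
        retrieval_results = get_retrieval_results(query)  # assumed to route to `system`
        relevance.append(evaluate_retrieval_relevance(retrieval_results)["average_relevance"])
        advice = get_generated_advice(query, retrieval_results)
        coverage.append(evaluate_retrieval_coverage(advice, retrieval_results)["coverage"])
    n = len(test_cases)
    return {
        "average_extraction_success_rate": sum(extraction) / n,
        "average_relevance": sum(relevance) / n,
        "average_coverage": sum(coverage) / n,
    }
```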

### **Stage 2: Cross-Model Comparative Evaluation**

```python
# Test all three models
all_systems = ["Med42-70B_direct", "Med42-70B_general_RAG", "OpenBioLLM-70B"]

# Measure the universal metrics
for system in all_systems:
    evaluate_universal_metrics(system)  # metrics 1, 5, 6
```

### **Stage 3: Combined Analysis**

```python
# Merge the results from both stages and generate the full report
combine_evaluation_results()
```
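
`combine_evaluation_results` is likewise only named here; a minimal sketch, assuming each stage yields a per-system dictionary of averaged metrics:

```python
from typing import Dict

def combine_evaluation_results(
    rag_metrics: Dict[str, Dict[str, float]],        # Stage 1 output, keyed by system name
    universal_metrics: Dict[str, Dict[str, float]],  # Stage 2 output, keyed by system name
) -> Dict[str, Dict[str, float]]:
    """Sketch: merge the two metric sets per system for the final report."""
    combined: Dict[str, Dict[str, float]] = {}
    for system in set(rag_metrics) | set(universal_metrics):
        combined[system] = {
            **universal_metrics.get(system, {}),
            **rag_metrics.get(system, {}),
        }
    return combined
```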

### Step 1.4 - No Visual Changes Confirmed ✅

This is a discussion of evaluation strategy; no code changes are involved.

**Your understanding is exactly right: the RAG-specific metrics can only be tested within the RAG systems, while the universal metrics can be compared across all models. This tiered evaluation strategy is sound.**