YanBoChen committed · Commit 5f9dffa · Parent(s): 253609b

fix(evaluation): improve evaluation instructions and add structured assessment phases

Files changed: evaluation/evaluation_instruction.md (+130 -36)

evaluation/evaluation_instruction.md (CHANGED)
# Model use

LLM models (for comparison with our own version):
https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B
https://huggingface.co/m42-health/Llama3-Med42-70B

[... lines 6-12 are unchanged and not shown in this diff; the elided block ends with its closing `"""` and closing code fence ...]

### Evaluation Execution Flow

```python
import time
from typing import Any, Dict, List

def run_complete_evaluation(model_name: str, test_cases: List[str]) -> Dict[str, Any]:
    """Run the complete six-metric evaluation."""
    results = {
        "model": model_name,
        "metrics": {},
        "detailed_results": []
    }

    total_latencies = []
    extraction_successes = []
    relevance_scores = []
    coverage_scores = []
    actionability_scores = []
    evidence_scores = []

    for query in test_cases:
        # Run the model and measure all metrics
        start_time = time.time()

        # 1. Total processing latency
        latency_result = measure_total_latency(query)
        total_latencies.append(latency_result['total_latency'])

        # 2. Condition extraction success rate
        extraction_result = evaluate_condition_extraction([query])
        extraction_successes.append(extraction_result['success_rate'])

        # 3 & 4. Retrieval relevance and coverage (require actual retrieval results)
        retrieval_results = get_retrieval_results(query)
        relevance_result = evaluate_retrieval_relevance(retrieval_results)
        relevance_scores.append(relevance_result['average_relevance'])

        generated_advice = get_generated_advice(query, retrieval_results)
        coverage_result = evaluate_retrieval_coverage(generated_advice, retrieval_results)
        coverage_scores.append(coverage_result['coverage'])

        # 5 & 6. LLM-based evaluation (requires the full response)
        response_data = {
            'query': query,
            'advice': generated_advice,
            'retrieval_results': retrieval_results
        }

        actionability_result = evaluate_clinical_actionability([response_data])
        actionability_scores.append(actionability_result[0]['overall_score'])

        evidence_result = evaluate_clinical_evidence([response_data])
        evidence_scores.append(evidence_result[0]['overall_score'])

        # Record detailed results
        results["detailed_results"].append({
            "query": query,
            # ... (unchanged lines not shown in this diff)
            "actionability": actionability_result[0],
            "evidence": evidence_result[0]
        })

    # Compute average metrics
    results["metrics"] = {
        "average_latency": sum(total_latencies) / len(total_latencies),
        # ... (unchanged lines not shown in this diff)
        "average_actionability": sum(actionability_scores) / len(actionability_scores),
        "average_evidence_score": sum(evidence_scores) / len(evidence_scores)
    }

    return results
```
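
The loop above relies on helper functions such as `measure_total_latency`, `get_retrieval_results`, and `get_generated_advice` that are defined elsewhere in the evaluation module. As a minimal sketch of the return shape `run_complete_evaluation` expects from `measure_total_latency`, assuming it simply times one end-to-end call of the system under test (the `pipeline` callable and the extra `output` key are illustrative assumptions):

```python
import time
from typing import Any, Callable, Dict

def measure_total_latency(query: str,
                          pipeline: Callable[[str], Any] = lambda q: q) -> Dict[str, Any]:
    """Time a single end-to-end call; run_complete_evaluation only reads 'total_latency'."""
    start = time.time()
    output = pipeline(query)  # placeholder for the direct model call or the RAG chain
    return {
        "total_latency": time.time() - start,  # seconds
        "output": output,
    }
```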

## 📈 Evaluation Results Analysis Framework

### Statistical Analysis

```python
def analyze_evaluation_results(results_A: Dict, results_B: Dict, results_C: Dict) -> Dict:
    """Compare the evaluation results of the three models."""
    models = ['Med42-70B_direct', 'RAG_enhanced', 'OpenBioLLM-70B']
    metrics = ['latency', 'extraction_success_rate', 'relevance', 'coverage', 'actionability', 'evidence_score']

    comparison = {}

    for metric in metrics:
        comparison[metric] = {
            models[0]: results_A['metrics'][f'average_{metric}'],
            models[1]: results_B['metrics'][f'average_{metric}'],
            models[2]: results_C['metrics'][f'average_{metric}']
        }

        # Compute the relative improvement over the direct (non-RAG) baseline
        baseline = comparison[metric][models[0]]
        rag_improvement = ((comparison[metric][models[1]] - baseline) / baseline) * 100

        comparison[metric]['rag_improvement_percent'] = rag_improvement

    return comparison
```
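
As a quick sanity check on the `rag_improvement_percent` formula, with made-up numbers: a baseline actionability of 7.0 and a RAG-enhanced score of 8.1 give a relative improvement of about +15.7%.

```python
# Illustrative placeholder values only, not measured results.
baseline = 7.0        # Med42-70B_direct
rag_enhanced = 8.1    # RAG_enhanced
rag_improvement_percent = (rag_enhanced - baseline) / baseline * 100
print(f"{rag_improvement_percent:+.1f}%")  # -> +15.7%
```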

### Report Generation

```python
def generate_evaluation_report(comparison_results: Dict) -> str:
    """Generate the evaluation report."""
    report = f"""
# OnCall.ai System Evaluation Report

## Evaluation Summary

| Metric | Med42-70B | RAG-enhanced | OpenBioLLM | RAG improvement % |
|--------|-----------|--------------|------------|-------------------|
| Processing time | {comparison_results['latency']['Med42-70B_direct']:.2f}s | {comparison_results['latency']['RAG_enhanced']:.2f}s | {comparison_results['latency']['OpenBioLLM-70B']:.2f}s | {comparison_results['latency']['rag_improvement_percent']:+.1f}% |
[... table rows on unchanged lines are not shown in this diff ...]
| Retrieval coverage | - | {comparison_results['coverage']['RAG_enhanced']:.1%} | - | - |
| Clinical actionability | {comparison_results['actionability']['Med42-70B_direct']:.1f}/10 | {comparison_results['actionability']['RAG_enhanced']:.1f}/10 | {comparison_results['actionability']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['actionability']['rag_improvement_percent']:+.1f}% |
| Clinical evidence score | {comparison_results['evidence_score']['Med42-70B_direct']:.1f}/10 | {comparison_results['evidence_score']['RAG_enhanced']:.1f}/10 | {comparison_results['evidence_score']['OpenBioLLM-70B']:.1f}/10 | {comparison_results['evidence_score']['rag_improvement_percent']:+.1f}% |
"""
    return report
```
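
The guide does not show how the returned Markdown string is persisted. A minimal sketch, assuming the report is written next to the JSON results (the path is an assumption):

```python
from pathlib import Path

def save_report(report: str, path: str = "evaluation/results/evaluation_report.md") -> None:
    """Write the generated Markdown report to disk, creating the folder if needed."""
    out = Path(path)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(report, encoding="utf-8")
```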

## 🔧 Experiment Execution Steps

### 1. Environment Setup

```bash
# Set the HuggingFace token (used for Inference Providers)
export HF_TOKEN=your_huggingface_token
# ... (unchanged lines not shown in this diff, including:)
export ONCALL_EVAL_MODE=true
```

### 2. Experiment Execution Script Skeleton

```python
# evaluation/run_evaluation.py
def main():
    """Main evaluation entry point."""
    # Load the test cases
    test_cases = MEDICAL_TEST_CASES

    # Experiment A: YanBo system evaluation
    print("🔬 Starting Experiment A: YanBo system evaluation")
    results_med42_direct = run_complete_evaluation("Med42-70B_direct", test_cases)
    results_general_rag = run_complete_evaluation("Med42-70B_general_RAG", test_cases)
    results_openbio = run_complete_evaluation("OpenBioLLM-70B", test_cases)

    # Analysis and reporting
    comparison_A = analyze_evaluation_results(results_med42_direct, results_general_rag, results_openbio)
    report_A = generate_evaluation_report(comparison_A)

    # Save results
    save_results("evaluation/results/yanbo_evaluation.json", {
        "comparison": comparison_A,
        "detailed_results": [results_med42_direct, results_general_rag, results_openbio]
    })

    print("✅ Experiment A finished; results saved")

    # Experiment B: Jeff system evaluation
    print("🔬 Starting Experiment B: Jeff system evaluation")
    results_med42_direct_b = run_complete_evaluation("Med42-70B_direct", test_cases)
    results_customized_rag = run_complete_evaluation("Med42-70B_customized_RAG", test_cases)
    results_openbio_b = run_complete_evaluation("OpenBioLLM-70B", test_cases)

    # Analysis and reporting
    comparison_B = analyze_evaluation_results(results_med42_direct_b, results_customized_rag, results_openbio_b)
    report_B = generate_evaluation_report(comparison_B)

    # Save results
    save_results("evaluation/results/jeff_evaluation.json", {
        "comparison": comparison_B,
        "detailed_results": [results_med42_direct_b, results_customized_rag, results_openbio_b]
    })

    print("✅ Experiment B finished; results saved")

if __name__ == "__main__":
    # ... (unchanged line not shown in this diff)
```
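
`save_results` is called above but not defined in the excerpt shown here. A minimal sketch consistent with how it is called (the JSON format and the signature are assumptions):

```python
import json
from pathlib import Path
from typing import Any, Dict

def save_results(path: str, payload: Dict[str, Any]) -> None:
    """Dump an evaluation payload to a JSON file, creating parent folders as needed."""
    out = Path(path)
    out.parent.mkdir(parents=True, exist_ok=True)  # e.g. evaluation/results/
    with out.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
```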

### 3. Expected Evaluation Time

```
Total evaluation time estimate:
├── Per-query processing time: ~30 seconds (including LLM-based evaluation)
[... remaining estimate lines are not shown in this diff ...]
```
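
As a back-of-the-envelope check on this estimate (the test-case count is an assumption; the rest of the breakdown is not shown in this diff):

```python
# Rough time budget; adjust n_test_cases to the actual size of MEDICAL_TEST_CASES.
seconds_per_query = 30   # from the estimate above, including LLM-based judging
n_test_cases = 10        # assumed for illustration
n_models = 3             # Med42-70B_direct, Med42-70B_general_RAG, OpenBioLLM-70B
total_minutes = seconds_per_query * n_test_cases * n_models / 60
print(f"≈ {total_minutes:.0f} minutes")  # -> ≈ 15 minutes
```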

## 📊 Evaluation Success Criteria

### System Performance Targets

```
✅ Pass criteria:
1. Total processing time ≤ 30 seconds
2. Condition extraction success rate ≥ 80%
3. Retrieval relevance ≥ 0.2
4. Retrieval coverage ≥ 60%
5. Clinical actionability ≥ 7.0/10
[... remaining criteria are not shown in this diff ...]
```
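
These thresholds can be checked automatically against the averages produced by `run_complete_evaluation`. A minimal sketch covering the five criteria visible above (the `average_*` key names follow the metric names used in `analyze_evaluation_results` and are otherwise an assumption):

```python
from typing import Dict

# (operator, threshold) per criterion; latency is an upper bound, the rest are lower bounds.
TARGETS = {
    "average_latency": ("<=", 30.0),                  # seconds
    "average_extraction_success_rate": (">=", 0.80),
    "average_relevance": (">=", 0.2),
    "average_coverage": (">=", 0.60),
    "average_actionability": (">=", 7.0),             # out of 10
}

def check_targets(metrics: Dict[str, float]) -> Dict[str, bool]:
    """Return a pass/fail flag per criterion for the metrics that are present."""
    verdicts = {}
    for key, (op, threshold) in TARGETS.items():
        if key not in metrics:
            continue  # retrieval metrics are absent for non-RAG systems
        value = metrics[key]
        verdicts[key] = value <= threshold if op == "<=" else value >= threshold
    return verdicts
```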

### Key Points for Comparative Analysis

```
Key analysis dimensions:
├── Impact of RAG on processing time (it may add latency)
[... remaining dimensions are not shown in this diff ...]
```

## 🛠️ Implementation Recommendations

### Phased Implementation

```
Phase 1: Implement the basic metrics (items 1-4)
├── Reuse the timing measurements already in app.py
[... remaining phase breakdown is not shown in this diff ...]
```

### Implementation Notes

```
⚠️ Important reminders:
1. All evaluation code should be independent of the existing system so that normal operation is not affected.
[... remaining reminders are not shown in this diff ...]
```

---

**The evaluation guide is complete. Please implement the evaluation experiments according to this guide.**

## Phase 1: Initial Assessment

### Step 1.1 - Analysis of Your Instructions

I have re-read your instructions and now understand what you meant.

### Step 1.2 - No Unclear Requirements

### Step 1.3 - Confirmation of Understanding (in Chinese)

Fully understood. Your evaluation architecture is:

## 🎯 **Test Level for Each Evaluation Metric**

### **Retrieval-Only Testing (Metrics 2, 3, 4)**

```python
# Only the RAG systems are tested here, because only they have these components:
retrieval_only_metrics = [
    "condition extraction success rate",  # only your system has user_prompt.py
    "retrieval relevance",                # only RAG systems produce retrieval results
    "retrieval coverage",                 # only RAG systems map retrieval results to generation
]

# Systems under test:
# - Med42-70B_general_RAG (your system)       ✅
# - Med42-70B_customized_RAG (Jeff's system)  ✅
# - Med42-70B_direct (no RAG)                 ❌ no retrieval component
# - OpenBioLLM-70B (no RAG)                   ❌ no retrieval component
```

### **Testing All Three Models Together (Metrics 1, 5, 6)**

```python
# Metrics that every model can be tested on:
universal_metrics = [
    "total processing time",    # every model has a response time
    "clinical actionability",   # Llama3-70B judges every model's output
    "clinical evidence score",  # Llama3-70B judges every model's output
]

# Systems under test:
# - Med42-70B_direct       ✅
# - Med42-70B_general_RAG  ✅
# - OpenBioLLM-70B         ✅
```

## 📊 **Layered Evaluation Strategy**

### **Stage 1: Internal Evaluation of the RAG Systems**

```python
# Only the systems that have RAG
rag_systems = ["Med42-70B_general_RAG", "Med42-70B_customized_RAG"]

# Test the RAG-specific metrics
for system in rag_systems:
    evaluate_retrieval_metrics(system)  # metrics 2, 3, 4
```

### **Stage 2: Comparative Evaluation Across All Models**

```python
# Test all three models
all_systems = ["Med42-70B_direct", "Med42-70B_general_RAG", "OpenBioLLM-70B"]

# Test the universal metrics
for system in all_systems:
    evaluate_universal_metrics(system)  # metrics 1, 5, 6
```

### **Stage 3: Combined Analysis**

```python
# Merge the results of both stages and generate the complete report
combine_evaluation_results()
```
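
The stage blocks above call `evaluate_retrieval_metrics`, `evaluate_universal_metrics`, and `combine_evaluation_results`, which are not defined in this guide. A minimal sketch of how they could be wired on top of `run_complete_evaluation` follows; the signatures (including the explicit `test_cases` argument) and the merged result format are assumptions for illustration, not the project's actual API.

```python
from typing import Any, Dict, List

RETRIEVAL_METRIC_KEYS = ["extraction_success_rate", "relevance", "coverage"]  # metrics 2-4
UNIVERSAL_METRIC_KEYS = ["latency", "actionability", "evidence_score"]        # metrics 1, 5, 6

def evaluate_retrieval_metrics(system: str, test_cases: List[str]) -> Dict[str, float]:
    """Run the full evaluation but keep only the RAG-specific averages (assumed key names)."""
    metrics = run_complete_evaluation(system, test_cases)["metrics"]
    return {k: v for k, v in metrics.items()
            if k.removeprefix("average_") in RETRIEVAL_METRIC_KEYS}

def evaluate_universal_metrics(system: str, test_cases: List[str]) -> Dict[str, float]:
    """Keep only the metrics that every system (RAG or not) can report."""
    metrics = run_complete_evaluation(system, test_cases)["metrics"]
    return {k: v for k, v in metrics.items()
            if k.removeprefix("average_") in UNIVERSAL_METRIC_KEYS}

def combine_evaluation_results(stage1: Dict[str, Dict[str, float]],
                               stage2: Dict[str, Dict[str, float]]) -> Dict[str, Any]:
    """Merge per-system results from both stages into one report-ready structure."""
    combined: Dict[str, Any] = {}
    for system in set(stage1) | set(stage2):
        combined[system] = {**stage1.get(system, {}), **stage2.get(system, {})}
    return combined
```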

### Step 1.4 - Confirmation: No Visual Changes ✅

This is a discussion of evaluation strategy; no code changes are involved.

**Your understanding is exactly right: the RAG-specific metrics can only be tested inside the RAG systems, while the universal metrics can be compared across all models. This layered evaluation strategy is very reasonable.**