---
language: en
license: apache-2.0
---
# Model Card

## Metrics
- position: -1
- layer: 31
- refusal_score: -9.527278900146484
- refusal_score_baseline: 6.786408424377441
- steering_score: 2.9693429470062256
- steering_score_baseline: -11.083024978637695
- kl_div_score: 0.04789839362178634
- no_filter: 11
- nan_values: 0
- late_layer: 45
- high_kl: 87
- low_refusal: 67